howard.objects.variants

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26import cyvcf2
   27import pyBigWig
   28import math
   29
   30from howard.functions.commons import *
   31from howard.objects.database import *
   32from howard.functions.databases import *
   33from howard.functions.utils import *
   34
   35
   36class Variants:
   37
   38    def __init__(
   39        self,
   40        conn=None,
   41        input: str = None,
   42        output: str = None,
   43        config: dict = {},
   44        param: dict = {},
   45        load: bool = False,
   46    ) -> None:
   47        """
   48        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   49        header
   50
   51        :param conn: the connection to the database
   52        :param input: the input file
   53        :param output: the output file
   54        :param config: a dictionary containing the configuration of the model
   55        :param param: a dictionary containing the parameters of the model
   56        """
   57
   58        # Init variables
   59        self.init_variables()
   60
   61        # Input
   62        self.set_input(input)
   63
   64        # Config
   65        self.set_config(config)
   66
   67        # Param
   68        self.set_param(param)
   69
   70        # Output
   71        self.set_output(output)
   72
   73        # connexion
   74        self.set_connexion(conn)
   75
   76        # Header
   77        self.set_header()
   78
   79        # Samples
   80        self.set_samples()
   81
   82        # Load data
   83        if load:
   84            self.load_data()
   85
   86    def set_samples(self, samples: list = None) -> list:
   87        """
   88        The function `set_samples` sets the samples attribute of an object to a provided list or
   89        retrieves it from a parameter dictionary.
   90
   91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   92        input and sets the `samples` attribute of the class to the provided list. If no samples are
   93        provided, it tries to get the samples from the class's parameters using the `get_param` method
   94        :type samples: list
   95        :return: The `samples` list is being returned.
   96        """
   97
   98        if not samples:
   99            samples = self.get_param().get("samples", {}).get("list", None)
  100
  101        self.samples = samples
  102
  103        return samples
  104
  105    def get_samples(self) -> list:
  106        """
  107        This function returns a list of samples.
  108        :return: The `get_samples` method is returning the `samples` attribute of the object.
  109        """
  110
  111        return self.samples
  112
  113    def get_samples_check(self) -> bool:
  114        """
  115        This function returns the value of the "check" key within the "samples" dictionary retrieved
  116        from the parameters.
  117        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  118        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  119        method. If the key "check" is not found, it will return `False`.
  120        """
  121
  122        return self.get_param().get("samples", {}).get("check", True)
  123
  124    def set_input(self, input: str = None) -> None:
  125        """
  126        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  127        attributes in the class accordingly.
  128
  129        :param input: The `set_input` method in the provided code snippet is used to set attributes
  130        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  131        :type input: str
  132        """
  133
  134        if input and not isinstance(input, str):
  135            try:
  136                self.input = input.name
  137            except:
  138                log.error(f"Input file '{input} in bad format")
  139                raise ValueError(f"Input file '{input} in bad format")
  140        else:
  141            self.input = input
  142
  143        # Input format
  144        if input:
  145            input_name, input_extension = os.path.splitext(self.input)
  146            self.input_name = input_name
  147            self.input_extension = input_extension
  148            self.input_format = self.input_extension.replace(".", "")
  149
  150    def set_config(self, config: dict) -> None:
  151        """
  152        The set_config function takes a config object and assigns it as the configuration object for the
  153        class.
  154
  155        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  156        contains configuration settings for the class. When you call the `set_config` function with a
  157        dictionary object as the argument, it will set that dictionary as the configuration object for
  158        the class
  159        :type config: dict
  160        """
  161
  162        self.config = config
  163
  164    def set_param(self, param: dict) -> None:
  165        """
  166        This function sets a parameter object for the class based on the input dictionary.
  167
  168        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  169        as the `param` attribute of the class instance
  170        :type param: dict
  171        """
  172
  173        self.param = param
  174
  175    def init_variables(self) -> None:
  176        """
  177        This function initializes the variables that will be used in the rest of the class
  178        """
  179
  180        self.prefix = "howard"
  181        self.table_variants = "variants"
  182        self.dataframe = None
  183
  184        self.comparison_map = {
  185            "gt": ">",
  186            "gte": ">=",
  187            "lt": "<",
  188            "lte": "<=",
  189            "equals": "=",
  190            "contains": "SIMILAR TO",
  191        }
  192
  193        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  194
  195        self.code_type_map_to_sql = {
  196            "Integer": "INTEGER",
  197            "String": "VARCHAR",
  198            "Float": "FLOAT",
  199            "Flag": "VARCHAR",
  200        }
  201
  202        self.index_additionnal_fields = []
  203
  204    def get_indexing(self) -> bool:
  205        """
  206        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  207        returns False.
  208        :return: The value of the indexing parameter.
  209        """
  210
  211        return self.get_param().get("indexing", False)
  212
  213    def get_connexion_config(self) -> dict:
  214        """
  215        The function `get_connexion_config` returns a dictionary containing the configuration for a
  216        connection, including the number of threads and memory limit.
  217        :return: a dictionary containing the configuration for the Connexion library.
  218        """
  219
  220        # config
  221        config = self.get_config()
  222
  223        # Connexion config
  224        connexion_config = {}
  225        threads = self.get_threads()
  226
  227        # Threads
  228        if threads:
  229            connexion_config["threads"] = threads
  230
  231        # Memory
  232        # if config.get("memory", None):
  233        #     connexion_config["memory_limit"] = config.get("memory")
  234        if self.get_memory():
  235            connexion_config["memory_limit"] = self.get_memory()
  236
  237        # Temporary directory
  238        if config.get("tmp", None):
  239            connexion_config["temp_directory"] = config.get("tmp")
  240
  241        # Access
  242        if config.get("access", None):
  243            access = config.get("access")
  244            if access in ["RO"]:
  245                access = "READ_ONLY"
  246            elif access in ["RW"]:
  247                access = "READ_WRITE"
  248            connexion_db = self.get_connexion_db()
  249            if connexion_db in ":memory:":
  250                access = "READ_WRITE"
  251            connexion_config["access_mode"] = access
  252
  253        return connexion_config
  254
  255    def get_duckdb_settings(self) -> dict:
  256        """
  257        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  258        string.
  259        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  260        """
  261
  262        # config
  263        config = self.get_config()
  264
  265        # duckdb settings
  266        duckdb_settings_dict = {}
  267        if config.get("duckdb_settings", None):
  268            duckdb_settings = config.get("duckdb_settings")
  269            duckdb_settings = full_path(duckdb_settings)
  270            # duckdb setting is a file
  271            if os.path.exists(duckdb_settings):
  272                with open(duckdb_settings) as json_file:
  273                    duckdb_settings_dict = yaml.safe_load(json_file)
  274            # duckdb settings is a string
  275            else:
  276                duckdb_settings_dict = json.loads(duckdb_settings)
  277
  278        return duckdb_settings_dict
  279
  280    def set_connexion_db(self) -> str:
  281        """
  282        The function `set_connexion_db` returns the appropriate database connection string based on the
  283        input format and connection type.
  284        :return: the value of the variable `connexion_db`.
  285        """
  286
  287        # Default connexion db
  288        default_connexion_db = ":memory:"
  289
  290        # Find connexion db
  291        if self.get_input_format() in ["db", "duckdb"]:
  292            connexion_db = self.get_input()
  293        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  294            connexion_db = default_connexion_db
  295        elif self.get_connexion_type() in ["tmpfile"]:
  296            tmp_name = tempfile.mkdtemp(
  297                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  298            )
  299            connexion_db = f"{tmp_name}/tmp.db"
  300        elif self.get_connexion_type() != "":
  301            connexion_db = self.get_connexion_type()
  302        else:
  303            connexion_db = default_connexion_db
  304
  305        # Set connexion db
  306        self.connexion_db = connexion_db
  307
  308        return connexion_db
  309
  310    def set_connexion(self, conn) -> None:
  311        """
  312        The function `set_connexion` creates a connection to a database, with options for different
  313        database formats and settings.
  314
  315        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  316        database. If a connection is not provided, a new connection to an in-memory database is created.
  317        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  318        sqlite
  319        """
  320
  321        # Connexion db
  322        connexion_db = self.set_connexion_db()
  323
  324        # Connexion config
  325        connexion_config = self.get_connexion_config()
  326
  327        # Connexion format
  328        connexion_format = self.get_config().get("connexion_format", "duckdb")
  329        # Set connexion format
  330        self.connexion_format = connexion_format
  331
  332        # Connexion
  333        if not conn:
  334            if connexion_format in ["duckdb"]:
  335                conn = duckdb.connect(connexion_db, config=connexion_config)
  336                # duckDB settings
  337                duckdb_settings = self.get_duckdb_settings()
  338                if duckdb_settings:
  339                    for setting in duckdb_settings:
  340                        setting_value = duckdb_settings.get(setting)
  341                        if isinstance(setting_value, str):
  342                            setting_value = f"'{setting_value}'"
  343                        conn.execute(f"PRAGMA {setting}={setting_value};")
  344            elif connexion_format in ["sqlite"]:
  345                conn = sqlite3.connect(connexion_db)
  346
  347        # Set connexion
  348        self.conn = conn
  349
  350        # Log
  351        log.debug(f"connexion_format: {connexion_format}")
  352        log.debug(f"connexion_db: {connexion_db}")
  353        log.debug(f"connexion config: {connexion_config}")
  354        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  355
  356    def set_output(self, output: str = None) -> None:
  357        """
  358        The `set_output` function in Python sets the output file based on the input or a specified key
  359        in the config file, extracting the output name, extension, and format.
  360
  361        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  362        the output file. If the config file has an 'output' key, the method sets the output to the value
  363        of that key. If no output is provided, it sets the output to `None`
  364        :type output: str
  365        """
  366
  367        if output and not isinstance(output, str):
  368            self.output = output.name
  369        else:
  370            self.output = output
  371
  372        # Output format
  373        if self.output:
  374            output_name, output_extension = os.path.splitext(self.output)
  375            self.output_name = output_name
  376            self.output_extension = output_extension
  377            self.output_format = self.output_extension.replace(".", "")
  378        else:
  379            self.output_name = None
  380            self.output_extension = None
  381            self.output_format = None
  382
    def set_header(self) -> None:
        """
        Read the VCF header of the input file and store it both as a list of
        strings (`self.header_list`) and as a `vcf.Reader` object
        (`self.header_vcf`).

        The header is looked up, in order of priority:
        1. in the file given by the "header_file" config key;
        2. inside the input file itself (vcf/hdr formats, bgzip-compressed
           or not);
        3. in an external "<input>.hdr" sidecar file;
        4. inferred from the file columns through the `Database` helper;
        falling back to a minimal default VCFv4.2 header on failure.

        When there is no input file, both attributes are set to None.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as fallback
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            # Supported input formats for header detection
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in config ("header_file" key)
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within the input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external sidecar file "<input>.hdr"
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database object on the input file
                            db_for_header = Database(database=input_file)

                            # Get header built from the file's columns (INFO fields)
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file — instantiating vcf.Writer
                            # presumably emits the template's header lines to
                            # the file; the writer object itself is discarded
                            # (TODO confirm against pyvcf docs)
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace the final #CHROM line with the real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:

                        # NOTE(review): bare except silently falls back to the
                        # default header on ANY error — consider narrowing
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unknown/unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            # Guard against an empty header
            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object (parsed from the joined header lines)
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
  484
  485    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  486        """
  487        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  488        DataFrame based on the connection format.
  489
  490        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  491        represents the SQL query you want to execute. This query will be used to fetch data from a
  492        database and convert it into a pandas DataFrame
  493        :type query: str
  494        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  495        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  496        function will only fetch up to that number of rows from the database query result. If no limit
  497        is specified,
  498        :type limit: int
  499        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  500        """
  501
  502        # Connexion format
  503        connexion_format = self.get_connexion_format()
  504
  505        # Limit in query
  506        if limit:
  507            pd.set_option("display.max_rows", limit)
  508            if connexion_format in ["duckdb"]:
  509                df = (
  510                    self.conn.execute(query)
  511                    .fetch_record_batch(limit)
  512                    .read_next_batch()
  513                    .to_pandas()
  514                )
  515            elif connexion_format in ["sqlite"]:
  516                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  517
  518        # Full query
  519        else:
  520            if connexion_format in ["duckdb"]:
  521                df = self.conn.execute(query).df()
  522            elif connexion_format in ["sqlite"]:
  523                df = pd.read_sql_query(query, self.conn)
  524
  525        return df
  526
  527    def get_overview(self) -> None:
  528        """
  529        The function prints the input, output, config, and dataframe of the current object
  530        """
  531        table_variants_from = self.get_table_variants(clause="from")
  532        sql_columns = self.get_header_columns_as_sql()
  533        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  534        df = self.get_query_to_df(sql_query_export)
  535        log.info(
  536            "Input:  "
  537            + str(self.get_input())
  538            + " ["
  539            + str(str(self.get_input_format()))
  540            + "]"
  541        )
  542        log.info(
  543            "Output: "
  544            + str(self.get_output())
  545            + " ["
  546            + str(str(self.get_output_format()))
  547            + "]"
  548        )
  549        log.info("Config: ")
  550        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  551            "\n"
  552        ):
  553            log.info("\t" + str(d))
  554        log.info("Param: ")
  555        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  556            "\n"
  557        ):
  558            log.info("\t" + str(d))
  559        log.info("Sample list: " + str(self.get_header_sample_list()))
  560        log.info("Dataframe: ")
  561        for d in str(df).split("\n"):
  562            log.info("\t" + str(d))
  563
  564        # garbage collector
  565        del df
  566        gc.collect()
  567
  568        return None
  569
    def get_stats(self) -> dict:
        """
        Compute statistics of the current object: input file infos, variant
        counts by chromosome, per-sample genotype counts, INFO/FORMAT header
        fields, QUAL statistics, and SNV/MNV/InDel counts with SNV
        substitution frequencies.

        NOTE(review): the SQL below uses DuckDB-dialect functions
        (REGEXP_EXTRACT, regexp_matches, len, string_split, median), so this
        method presumably requires a duckdb connexion — confirm for sqlite.

        :return: a dictionary with keys "Infos", "Variants", "Header", plus
            "Samples" when genotypes are present and "Quality" when a QUAL
            column is present.
        """

        # Log
        log.info(f"Stats Calculation...")

        # variants table
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT field definitions
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Fraction of variants per chromosome (0..1, not percent)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Per-sample genotype counts, only when the file carries genotypes
        # (GT declared in FORMAT header and a FORMAT column present)
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Extract the leading genotype (e.g. "0/1") of each sample
                # value and count occurrences per genotype; rows are kept
                # only when the sample value matches a genotype pattern and
                # has as many ':'-fields as FORMAT declares
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts only if at least one genotype was found
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running row index shared across both field groups
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: special codes map to the VCF letters
                # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ("." when missing)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description (empty string when missing)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL statistics, only when a QUAL column is present
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the InDel clause below, AND binds tighter than OR,
        # so it reads `len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT))`
        # — this also counts MNVs (REF and ALT both long and equal length);
        # verify whether parentheses were intended
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution counts (e.g. "A>G"), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  791
  792    def stats_to_file(self, file: str = None) -> str:
  793        """
  794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  795        into a JSON object, and writes the JSON object to the specified file.
  796
  797        :param file: The `file` parameter is a string that represents the file path where the JSON data
  798        will be written
  799        :type file: str
  800        :return: the name of the file that was written to.
  801        """
  802
  803        # Get stats
  804        stats = self.get_stats()
  805
  806        # Serializing json
  807        json_object = json.dumps(stats, indent=4)
  808
  809        # Writing to sample.json
  810        with open(file, "w") as outfile:
  811            outfile.write(json_object)
  812
  813        return file
  814
  815    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  816        """
  817        The `print_stats` function generates a markdown file and prints the statistics contained in a
  818        JSON file in a formatted manner.
  819
  820        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  821        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  822        provided, a temporary directory will be created and the stats will be saved in a file named
  823        "stats.md" within that
  824        :type output_file: str
  825        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  826        file where the statistics will be saved. If no value is provided, a temporary directory will be
  827        created and a default file name "stats.json" will be used
  828        :type json_file: str
  829        :return: The function `print_stats` does not return any value. It has a return type annotation
  830        of `None`.
  831        """
  832
  833        # Full path
  834        output_file = full_path(output_file)
  835        json_file = full_path(json_file)
  836
  837        with tempfile.TemporaryDirectory() as tmpdir:
  838
  839            # Files
  840            if not output_file:
  841                output_file = os.path.join(tmpdir, "stats.md")
  842            if not json_file:
  843                json_file = os.path.join(tmpdir, "stats.json")
  844
  845            # Create folders
  846            if not os.path.exists(os.path.dirname(output_file)):
  847                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  848            if not os.path.exists(os.path.dirname(json_file)):
  849                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  850
  851            # Create stats JSON file
  852            stats_file = self.stats_to_file(file=json_file)
  853
  854            # Print stats file
  855            with open(stats_file) as f:
  856                stats = yaml.safe_load(f)
  857
  858            # Output
  859            output_title = []
  860            output_index = []
  861            output = []
  862
  863            # Title
  864            output_title.append("# HOWARD Stats")
  865
  866            # Index
  867            output_index.append("## Index")
  868
  869            # Process sections
  870            for section in stats:
  871                infos = stats.get(section)
  872                section_link = "#" + section.lower().replace(" ", "-")
  873                output.append(f"## {section}")
  874                output_index.append(f"- [{section}]({section_link})")
  875
  876                if len(infos):
  877                    for info in infos:
  878                        try:
  879                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  880                            is_df = True
  881                        except:
  882                            try:
  883                                df = pd.DataFrame.from_dict(
  884                                    json.loads((infos.get(info))), orient="index"
  885                                )
  886                                is_df = True
  887                            except:
  888                                is_df = False
  889                        if is_df:
  890                            output.append(f"### {info}")
  891                            info_link = "#" + info.lower().replace(" ", "-")
  892                            output_index.append(f"   - [{info}]({info_link})")
  893                            output.append(f"{df.to_markdown(index=False)}")
  894                        else:
  895                            output.append(f"- {info}: {infos.get(info)}")
  896                else:
  897                    output.append(f"NA")
  898
  899            # Write stats in markdown file
  900            with open(output_file, "w") as fp:
  901                for item in output_title:
  902                    fp.write("%s\n" % item)
  903                for item in output_index:
  904                    fp.write("%s\n" % item)
  905                for item in output:
  906                    fp.write("%s\n" % item)
  907
  908            # Output stats in markdown
  909            print("")
  910            print("\n\n".join(output_title))
  911            print("")
  912            print("\n\n".join(output))
  913            print("")
  914
  915        return None
  916
  917    def get_input(self) -> str:
  918        """
  919        It returns the value of the input variable.
  920        :return: The input is being returned.
  921        """
  922        return self.input
  923
  924    def get_input_format(self, input_file: str = None) -> str:
  925        """
  926        This function returns the format of the input variable, either from the provided input file or
  927        by prompting for input.
  928
  929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  930        represents the file path of the input file. If no `input_file` is provided when calling the
  931        method, it will default to `None`
  932        :type input_file: str
  933        :return: The format of the input variable is being returned.
  934        """
  935
  936        if not input_file:
  937            input_file = self.get_input()
  938        input_format = get_file_format(input_file)
  939        return input_format
  940
  941    def get_input_compressed(self, input_file: str = None) -> str:
  942        """
  943        The function `get_input_compressed` returns the format of the input variable after compressing
  944        it.
  945
  946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  947        that represents the file path of the input file. If no `input_file` is provided when calling the
  948        method, it will default to `None` and the method will then call `self.get_input()` to
  949        :type input_file: str
  950        :return: The function `get_input_compressed` returns the compressed format of the input
  951        variable.
  952        """
  953
  954        if not input_file:
  955            input_file = self.get_input()
  956        input_compressed = get_file_compressed(input_file)
  957        return input_compressed
  958
  959    def get_output(self) -> str:
  960        """
  961        It returns the output of the neuron.
  962        :return: The output of the neural network.
  963        """
  964
  965        return self.output
  966
  967    def get_output_format(self, output_file: str = None) -> str:
  968        """
  969        The function `get_output_format` returns the format of the input variable or the output file if
  970        provided.
  971
  972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  973        that represents the file path of the output file. If no `output_file` is provided when calling
  974        the method, it will default to the output obtained from the `get_output` method of the class
  975        instance. The
  976        :type output_file: str
  977        :return: The format of the input variable is being returned.
  978        """
  979
  980        if not output_file:
  981            output_file = self.get_output()
  982        output_format = get_file_format(output_file)
  983
  984        return output_format
  985
  986    def get_config(self) -> dict:
  987        """
  988        It returns the config
  989        :return: The config variable is being returned.
  990        """
  991        return self.config
  992
  993    def get_param(self) -> dict:
  994        """
  995        It returns the param
  996        :return: The param variable is being returned.
  997        """
  998        return self.param
  999
 1000    def get_connexion_db(self) -> str:
 1001        """
 1002        It returns the connexion_db attribute of the object
 1003        :return: The connexion_db is being returned.
 1004        """
 1005        return self.connexion_db
 1006
 1007    def get_prefix(self) -> str:
 1008        """
 1009        It returns the prefix of the object.
 1010        :return: The prefix is being returned.
 1011        """
 1012        return self.prefix
 1013
 1014    def get_table_variants(self, clause: str = "select") -> str:
 1015        """
 1016        This function returns the table_variants attribute of the object
 1017
 1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1019        defaults to select (optional)
 1020        :return: The table_variants attribute of the object.
 1021        """
 1022
 1023        # Access
 1024        access = self.get_config().get("access", None)
 1025
 1026        # Clauses "select", "where", "update"
 1027        if clause in ["select", "where", "update"]:
 1028            table_variants = self.table_variants
 1029        # Clause "from"
 1030        elif clause in ["from"]:
 1031            # For Read Only
 1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1033                input_file = self.get_input()
 1034                table_variants = f"'{input_file}' as variants"
 1035            # For Read Write
 1036            else:
 1037                table_variants = f"{self.table_variants} as variants"
 1038        else:
 1039            table_variants = self.table_variants
 1040        return table_variants
 1041
 1042    def get_tmp_dir(self) -> str:
 1043        """
 1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1045        parameters or a default path.
 1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1047        configuration, parameters, and a default value of "/tmp".
 1048        """
 1049
 1050        return get_tmp(
 1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1052        )
 1053
 1054    def get_connexion_type(self) -> str:
 1055        """
 1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1057
 1058        :return: The connexion type is being returned.
 1059        """
 1060        return self.get_config().get("connexion_type", "memory")
 1061
 1062    def get_connexion(self):
 1063        """
 1064        It returns the connection object
 1065
 1066        :return: The connection object.
 1067        """
 1068        return self.conn
 1069
 1070    def close_connexion(self) -> None:
 1071        """
 1072        This function closes the connection to the database.
 1073        :return: The connection is being closed.
 1074        """
 1075        return self.conn.close()
 1076
 1077    def get_header(self, type: str = "vcf"):
 1078        """
 1079        This function returns the header of the VCF file as a list of strings
 1080
 1081        :param type: the type of header you want to get, defaults to vcf (optional)
 1082        :return: The header of the vcf file.
 1083        """
 1084
 1085        if self.header_vcf:
 1086            if type == "vcf":
 1087                return self.header_vcf
 1088            elif type == "list":
 1089                return self.header_list
 1090        else:
 1091            if type == "vcf":
 1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1093                return header
 1094            elif type == "list":
 1095                return vcf_required
 1096
 1097    def get_header_infos_list(self) -> list:
 1098        """
 1099        This function retrieves a list of information fields from the header.
 1100        :return: A list of information fields from the header.
 1101        """
 1102
 1103        # Init
 1104        infos_list = []
 1105
 1106        for field in self.get_header().infos:
 1107            infos_list.append(field)
 1108
 1109        return infos_list
 1110
 1111    def get_header_length(self, file: str = None) -> int:
 1112        """
 1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1114        line.
 1115
 1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1117        header file. If this argument is provided, the function will read the header from the specified
 1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1119        :type file: str
 1120        :return: the length of the header list, excluding the #CHROM line.
 1121        """
 1122
 1123        if file:
 1124            return len(self.read_vcf_header_file(file=file)) - 1
 1125        elif self.get_header(type="list"):
 1126            return len(self.get_header(type="list")) - 1
 1127        else:
 1128            return 0
 1129
 1130    def get_header_columns(self) -> str:
 1131        """
 1132        This function returns the header list of a VCF
 1133
 1134        :return: The length of the header list.
 1135        """
 1136        if self.get_header():
 1137            return self.get_header(type="list")[-1]
 1138        else:
 1139            return ""
 1140
 1141    def get_header_columns_as_list(self) -> list:
 1142        """
 1143        This function returns the header list of a VCF
 1144
 1145        :return: The length of the header list.
 1146        """
 1147        if self.get_header():
 1148            return self.get_header_columns().strip().split("\t")
 1149        else:
 1150            return []
 1151
 1152    def get_header_columns_as_sql(self) -> str:
 1153        """
 1154        This function retruns header length (without #CHROM line)
 1155
 1156        :return: The length of the header list.
 1157        """
 1158        sql_column_list = []
 1159        for col in self.get_header_columns_as_list():
 1160            sql_column_list.append(f'"{col}"')
 1161        return ",".join(sql_column_list)
 1162
 1163    def get_header_sample_list(
 1164        self, check: bool = False, samples: list = None, samples_force: bool = False
 1165    ) -> list:
 1166        """
 1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1168        checking and filtering based on input parameters.
 1169
 1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1171        parameter that determines whether to check if the samples in the list are properly defined as
 1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1173        list is defined as a, defaults to False
 1174        :type check: bool (optional)
 1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1176        allows you to specify a subset of samples from the header. If you provide a list of sample
 1177        names, the function will check if each sample is defined in the header. If a sample is not found
 1178        in the
 1179        :type samples: list
 1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1181        a boolean parameter that determines whether to force the function to return the sample list
 1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1183        function will return the sample list without performing, defaults to False
 1184        :type samples_force: bool (optional)
 1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1186        parameters and conditions specified in the function.
 1187        """
 1188
 1189        # Init
 1190        samples_list = []
 1191
 1192        if samples is None:
 1193            samples_list = self.header_vcf.samples
 1194        else:
 1195            samples_checked = []
 1196            for sample in samples:
 1197                if sample in self.header_vcf.samples:
 1198                    samples_checked.append(sample)
 1199                else:
 1200                    log.warning(f"Sample '{sample}' not defined in header")
 1201            samples_list = samples_checked
 1202
 1203            # Force sample list without checking if is_genotype_column
 1204            if samples_force:
 1205                log.warning(f"Samples {samples_list} not checked if genotypes")
 1206                return samples_list
 1207
 1208        if check:
 1209            samples_checked = []
 1210            for sample in samples_list:
 1211                if self.is_genotype_column(column=sample):
 1212                    samples_checked.append(sample)
 1213                else:
 1214                    log.warning(
 1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1216                    )
 1217            samples_list = samples_checked
 1218
 1219        # Return samples list
 1220        return samples_list
 1221
 1222    def is_genotype_column(self, column: str = None) -> bool:
 1223        """
 1224        This function checks if a given column is a genotype column in a database.
 1225
 1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1227        represents the column name in a database table. This method checks if the specified column is a
 1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1229        method of
 1230        :type column: str
 1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1233        column name and returns the result. If the `column` parameter is None, it returns False.
 1234        """
 1235
 1236        if column is not None:
 1237            return Database(database=self.get_input()).is_genotype_column(column=column)
 1238        else:
 1239            return False
 1240
 1241    def get_verbose(self) -> bool:
 1242        """
 1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1244        exist
 1245
 1246        :return: The value of the key "verbose" in the config dictionary.
 1247        """
 1248        return self.get_config().get("verbose", False)
 1249
 1250    def get_connexion_format(self) -> str:
 1251        """
 1252        It returns the connexion format of the object.
 1253        :return: The connexion_format is being returned.
 1254        """
 1255        connexion_format = self.connexion_format
 1256        if connexion_format not in ["duckdb", "sqlite"]:
 1257            log.error(f"Unknown connexion format {connexion_format}")
 1258            raise ValueError(f"Unknown connexion format {connexion_format}")
 1259        else:
 1260            return connexion_format
 1261
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table, using the loading strategy of the current database
        backend (duckdb or sqlite).

        :param file: path or file-like object of the file to load
        :param columns: comma-separated (quoted) column names used for the
            INSERT statement
        :type columns: str
        :param header_len: number of leading lines to skip before the data
            (e.g. a VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: field separator of the input file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config
        # The configured load chunk size ("load.chunk") overrides the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # NOTE(review): the SQL references the local DataFrame
                    # "chunk" by name — presumably resolved through duckdb's
                    # replacement scan of Python locals, so the loop variable
                    # name must stay "chunk"; confirm against duckdb docs
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # pandas performs the INSERTs directly for sqlite
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1315
 1316    def load_data(
 1317        self,
 1318        input_file: str = None,
 1319        drop_variants_table: bool = False,
 1320        sample_size: int = 20480,
 1321    ) -> None:
 1322        """
 1323        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1324        table before loading the data and specify a sample size.
 1325
 1326        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1327        table
 1328        :type input_file: str
 1329        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1330        determines whether the variants table should be dropped before loading the data. If set to
 1331        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1332        not be dropped, defaults to False
 1333        :type drop_variants_table: bool (optional)
 1334        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1335        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1336        20480
 1337        :type sample_size: int (optional)
 1338        """
 1339
 1340        log.info("Loading...")
 1341
 1342        # change input file
 1343        if input_file:
 1344            self.set_input(input_file)
 1345            self.set_header()
 1346
 1347        # drop variants table
 1348        if drop_variants_table:
 1349            self.drop_variants_table()
 1350
 1351        # get table variants
 1352        table_variants = self.get_table_variants()
 1353
 1354        # Access
 1355        access = self.get_config().get("access", None)
 1356        log.debug(f"access: {access}")
 1357
 1358        # Input format and compress
 1359        input_format = self.get_input_format()
 1360        input_compressed = self.get_input_compressed()
 1361        log.debug(f"input_format: {input_format}")
 1362        log.debug(f"input_compressed: {input_compressed}")
 1363
 1364        # input_compressed_format
 1365        if input_compressed:
 1366            input_compressed_format = "gzip"
 1367        else:
 1368            input_compressed_format = "none"
 1369        log.debug(f"input_compressed_format: {input_compressed_format}")
 1370
 1371        # Connexion format
 1372        connexion_format = self.get_connexion_format()
 1373
 1374        # Sample size
 1375        if not sample_size:
 1376            sample_size = -1
 1377        log.debug(f"sample_size: {sample_size}")
 1378
 1379        # Load data
 1380        log.debug(f"Load Data from {input_format}")
 1381
 1382        # DuckDB connexion
 1383        if connexion_format in ["duckdb"]:
 1384
 1385            # Database already exists
 1386            if self.input_format in ["db", "duckdb"]:
 1387
 1388                if connexion_format in ["duckdb"]:
 1389                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1390                else:
 1391                    log.error(
 1392                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1393                    )
 1394                    raise ValueError(
 1395                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1396                    )
 1397
 1398            # Load from existing database format
 1399            else:
 1400
 1401                try:
 1402                    # Create Table or View
 1403                    database = Database(database=self.input)
 1404                    sql_from = database.get_sql_from(sample_size=sample_size)
 1405
 1406                    if access in ["RO"]:
 1407                        sql_load = (
 1408                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1409                        )
 1410                    else:
 1411                        sql_load = (
 1412                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1413                        )
 1414                    self.conn.execute(sql_load)
 1415
 1416                except:
 1417                    # Format not available
 1418                    log.error(f"Input file format '{self.input_format}' not available")
 1419                    raise ValueError(
 1420                        f"Input file format '{self.input_format}' not available"
 1421                    )
 1422
 1423        # SQLite connexion
 1424        elif connexion_format in ["sqlite"] and input_format in [
 1425            "vcf",
 1426            "tsv",
 1427            "csv",
 1428            "psv",
 1429        ]:
 1430
 1431            # Main structure
 1432            structure = {
 1433                "#CHROM": "VARCHAR",
 1434                "POS": "INTEGER",
 1435                "ID": "VARCHAR",
 1436                "REF": "VARCHAR",
 1437                "ALT": "VARCHAR",
 1438                "QUAL": "VARCHAR",
 1439                "FILTER": "VARCHAR",
 1440                "INFO": "VARCHAR",
 1441            }
 1442
 1443            # Strcuture with samples
 1444            structure_complete = structure
 1445            if self.get_header_sample_list():
 1446                structure["FORMAT"] = "VARCHAR"
 1447                for sample in self.get_header_sample_list():
 1448                    structure_complete[sample] = "VARCHAR"
 1449
 1450            # Columns list for create and insert
 1451            sql_create_table_columns = []
 1452            sql_create_table_columns_list = []
 1453            for column in structure_complete:
 1454                column_type = structure_complete[column]
 1455                sql_create_table_columns.append(
 1456                    f'"{column}" {column_type} default NULL'
 1457                )
 1458                sql_create_table_columns_list.append(f'"{column}"')
 1459
 1460            # Create database
 1461            log.debug(f"Create Table {table_variants}")
 1462            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1463            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1464            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1465            self.conn.execute(sql_create_table)
 1466
 1467            # chunksize define length of file chunk load file
 1468            chunksize = 100000
 1469
 1470            # delimiter
 1471            delimiter = file_format_delimiters.get(input_format, "\t")
 1472
 1473            # Load the input file
 1474            with open(self.input, "rt") as input_file:
 1475
 1476                # Use the appropriate file handler based on the input format
 1477                if input_compressed:
 1478                    input_file = bgzf.open(self.input, "rt")
 1479                if input_format in ["vcf"]:
 1480                    header_len = self.get_header_length()
 1481                else:
 1482                    header_len = 0
 1483
 1484                # Insert the file contents into a table
 1485                self.insert_file_to_table(
 1486                    input_file,
 1487                    columns=sql_create_table_columns_list_sql,
 1488                    header_len=header_len,
 1489                    sep=delimiter,
 1490                    chunksize=chunksize,
 1491                )
 1492
 1493        else:
 1494            log.error(
 1495                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1496            )
 1497            raise ValueError(
 1498                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1499            )
 1500
 1501        # Explode INFOS fields into table fields
 1502        if self.get_explode_infos():
 1503            self.explode_infos(
 1504                prefix=self.get_explode_infos_prefix(),
 1505                fields=self.get_explode_infos_fields(),
 1506                force=True,
 1507            )
 1508
 1509        # Create index after insertion
 1510        self.create_indexes()
 1511
 1512    def get_explode_infos(self) -> bool:
 1513        """
 1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1515        to False if it is not set.
 1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1517        value. If the parameter is not present, it will return False.
 1518        """
 1519
 1520        return self.get_param().get("explode", {}).get("explode_infos", False)
 1521
 1522    def get_explode_infos_fields(
 1523        self,
 1524        explode_infos_fields: str = None,
 1525        remove_fields_not_in_header: bool = False,
 1526    ) -> list:
 1527        """
 1528        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1529        the input parameter `explode_infos_fields`.
 1530
 1531        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1532        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1533        comma-separated list of field names to explode
 1534        :type explode_infos_fields: str
 1535        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1536        flag that determines whether to remove fields that are not present in the header. If it is set
 1537        to `True`, any field that is not in the header will be excluded from the list of exploded
 1538        information fields. If it is set to `, defaults to False
 1539        :type remove_fields_not_in_header: bool (optional)
 1540        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1541        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1542        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1543        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1544        splitting the string by commas.
 1545        """
 1546
 1547        # If no fields, get it in param
 1548        if not explode_infos_fields:
 1549            explode_infos_fields = (
 1550                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1551            )
 1552
 1553        # If no fields, defined as all fields in header using keyword
 1554        if not explode_infos_fields:
 1555            explode_infos_fields = "*"
 1556
 1557        # If fields list not empty
 1558        if explode_infos_fields:
 1559
 1560            # Input fields list
 1561            if isinstance(explode_infos_fields, str):
 1562                fields_input = explode_infos_fields.split(",")
 1563            elif isinstance(explode_infos_fields, list):
 1564                fields_input = explode_infos_fields
 1565            else:
 1566                fields_input = []
 1567
 1568            # Fields list without * keyword
 1569            fields_without_all = fields_input.copy()
 1570            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1571                fields_without_all.remove("*")
 1572
 1573            # Fields in header
 1574            fields_in_header = sorted(list(set(self.get_header().infos)))
 1575
 1576            # Construct list of fields
 1577            fields_output = []
 1578            for field in fields_input:
 1579
 1580                # Strip field
 1581                field = field.strip()
 1582
 1583                # format keyword * in regex
 1584                if field.upper() in ["*"]:
 1585                    field = ".*"
 1586
 1587                # Find all fields with pattern
 1588                r = re.compile(field)
 1589                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1590
 1591                # Remove fields input from search
 1592                if field in fields_search:
 1593                    fields_search = [field]
 1594                elif fields_search != [field]:
 1595                    fields_search = sorted(
 1596                        list(set(fields_search).difference(fields_input))
 1597                    )
 1598
 1599                # If field is not in header (avoid not well formatted header)
 1600                if not fields_search and not remove_fields_not_in_header:
 1601                    fields_search = [field]
 1602
 1603                # Add found fields
 1604                for new_field in fields_search:
 1605                    # Add field, if not already exists, and if it is in header (if asked)
 1606                    if (
 1607                        new_field not in fields_output
 1608                        and (
 1609                            not remove_fields_not_in_header
 1610                            or new_field in fields_in_header
 1611                        )
 1612                        and new_field not in [".*"]
 1613                    ):
 1614                        fields_output.append(new_field)
 1615
 1616            return fields_output
 1617
 1618        else:
 1619
 1620            return []
 1621
 1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1623        """
 1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1626        not provided.
 1627
 1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1629        prefix to be used for exploding or expanding information
 1630        :type explode_infos_prefix: str
 1631        :return: the value of the variable `explode_infos_prefix`.
 1632        """
 1633
 1634        if not explode_infos_prefix:
 1635            explode_infos_prefix = (
 1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1637            )
 1638
 1639        return explode_infos_prefix
 1640
 1641    def add_column(
 1642        self,
 1643        table_name,
 1644        column_name,
 1645        column_type,
 1646        default_value=None,
 1647        drop: bool = False,
 1648    ) -> dict:
 1649        """
 1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1651        doesn't already exist.
 1652
 1653        :param table_name: The name of the table to which you want to add a column
 1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1655        to the table
 1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1657        want to add to the table. It should be a string that represents the desired data type, such as
 1658        "INTEGER", "TEXT", "REAL", etc
 1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1660        default value for the newly added column. If a default value is provided, it will be assigned to
 1661        the column for any existing rows that do not have a value for that column
 1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1665        to False
 1666        :type drop: bool (optional)
 1667        :return: a boolean value indicating whether the column was successfully added to the table.
 1668        """
 1669
 1670        # added
 1671        added = False
 1672        dropped = False
 1673
 1674        # Check if the column already exists in the table
 1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1676        columns = self.get_query_to_df(query).columns.tolist()
 1677        if column_name.upper() in [c.upper() for c in columns]:
 1678            log.debug(
 1679                f"The {column_name} column already exists in the {table_name} table"
 1680            )
 1681            if drop:
 1682                self.drop_column(table_name=table_name, column_name=column_name)
 1683                dropped = True
 1684            else:
 1685                return None
 1686        else:
 1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1688
 1689        # Add column in table
 1690        add_column_query = (
 1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1692        )
 1693        if default_value is not None:
 1694            add_column_query += f" DEFAULT {default_value}"
 1695        self.execute_query(add_column_query)
 1696        added = not dropped
 1697        log.debug(
 1698            f"The {column_name} column was successfully added to the {table_name} table"
 1699        )
 1700
 1701        if added:
 1702            added_column = {
 1703                "table_name": table_name,
 1704                "column_name": column_name,
 1705                "column_type": column_type,
 1706                "default_value": default_value,
 1707            }
 1708        else:
 1709            added_column = None
 1710
 1711        return added_column
 1712
 1713    def drop_column(
 1714        self, column: dict = None, table_name: str = None, column_name: str = None
 1715    ) -> bool:
 1716        """
 1717        The `drop_column` function drops a specified column from a given table in a database and returns
 1718        True if the column was successfully dropped, and False if the column does not exist in the
 1719        table.
 1720
 1721        :param column: The `column` parameter is a dictionary that contains information about the column
 1722        you want to drop. It has two keys:
 1723        :type column: dict
 1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1725        drop a column
 1726        :type table_name: str
 1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1728        from the table
 1729        :type column_name: str
 1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1731        and False if the column does not exist in the table.
 1732        """
 1733
 1734        # Find column infos
 1735        if column:
 1736            if isinstance(column, dict):
 1737                table_name = column.get("table_name", None)
 1738                column_name = column.get("column_name", None)
 1739            elif isinstance(column, str):
 1740                table_name = self.get_table_variants()
 1741                column_name = column
 1742            else:
 1743                table_name = None
 1744                column_name = None
 1745
 1746        if not table_name and not column_name:
 1747            return False
 1748
 1749        # Removed
 1750        removed = False
 1751
 1752        # Check if the column already exists in the table
 1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1754        columns = self.get_query_to_df(query).columns.tolist()
 1755        if column_name in columns:
 1756            log.debug(f"The {column_name} column exists in the {table_name} table")
 1757        else:
 1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1759            return False
 1760
 1761        # Add column in table # ALTER TABLE integers DROP k
 1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1763        self.execute_query(add_column_query)
 1764        removed = True
 1765        log.debug(
 1766            f"The {column_name} column was successfully dropped to the {table_name} table"
 1767        )
 1768
 1769        return removed
 1770
 1771    def explode_infos(
 1772        self,
 1773        prefix: str = None,
 1774        create_index: bool = False,
 1775        fields: list = None,
 1776        force: bool = False,
 1777        proccess_all_fields_together: bool = False,
 1778        table: str = None,
 1779    ) -> list:
 1780        """
 1781        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
 1782        individual columns, returning a list of added columns.
 1783
 1784        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
 1785        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
 1786        `self.get_explode_infos_prefix()` as the prefix
 1787        :type prefix: str
 1788        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
 1789        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
 1790        `False`, indexes will not be created. The default value is `False`, defaults to False
 1791        :type create_index: bool (optional)
 1792        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
 1793        that you want to explode into individual columns. If this parameter is not provided, all INFO
 1794        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
 1795        a list to the `
 1796        :type fields: list
 1797        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
 1798        determines whether to drop and recreate a column if it already exists in the table. If `force`
 1799        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
 1800        defaults to False
 1801        :type force: bool (optional)
 1802        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
 1803        flag that determines whether to process all the INFO fields together or individually. If set to
 1804        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
 1805        be processed individually. The default value is, defaults to False
 1806        :type proccess_all_fields_together: bool (optional)
 1807        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
 1808        of the table where the exploded INFO fields will be added as individual columns. If you provide
 1809        a value for the `table` parameter, the function will use that table name. If the `table`
 1810        parameter is
 1811        :type table: str
 1812        :return: The `explode_infos` function returns a list of added columns.
 1813        """
 1814
 1815        # drop indexes
 1816        self.drop_indexes()
 1817
 1818        # connexion format
 1819        connexion_format = self.get_connexion_format()
 1820
 1821        # Access
 1822        access = self.get_config().get("access", None)
 1823
 1824        # Added columns
 1825        added_columns = []
 1826
 1827        if access not in ["RO"]:
 1828
 1829            # prefix
 1830            if prefix in [None, True] or not isinstance(prefix, str):
 1831                if self.get_explode_infos_prefix() not in [None, True]:
 1832                    prefix = self.get_explode_infos_prefix()
 1833                else:
 1834                    prefix = "INFO/"
 1835
 1836            # table variants
 1837            if table is not None:
 1838                table_variants = table
 1839            else:
 1840                table_variants = self.get_table_variants(clause="select")
 1841
 1842            # extra infos
 1843            try:
 1844                extra_infos = self.get_extra_infos()
 1845            except:
 1846                extra_infos = []
 1847
 1848            # Header infos
 1849            header_infos = self.get_header().infos
 1850
 1851            log.debug(
 1852                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
 1853            )
 1854
 1855            sql_info_alter_table_array = []
 1856
 1857            # Info fields to check
 1858            fields_list = list(header_infos)
 1859            if fields:
 1860                fields_list += fields
 1861            fields_list = set(fields_list)
 1862
 1863            # If no fields
 1864            if not fields:
 1865                fields = []
 1866
 1867            # Translate fields if patterns
 1868            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
 1869
 1870            for info in fields:
 1871
 1872                info_id_sql = prefix + info
 1873
 1874                if (
 1875                    info in fields_list
 1876                    or prefix + info in fields_list
 1877                    or info in extra_infos
 1878                ):
 1879
 1880                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
 1881
 1882                    if info in header_infos:
 1883                        info_type = header_infos[info].type
 1884                        info_num = header_infos[info].num
 1885                    else:
 1886                        info_type = "String"
 1887                        info_num = 0
 1888
 1889                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
 1890                    if info_num != 1:
 1891                        type_sql = "VARCHAR"
 1892
 1893                    # Add field
 1894                    added_column = self.add_column(
 1895                        table_name=table_variants,
 1896                        column_name=info_id_sql,
 1897                        column_type=type_sql,
 1898                        default_value="null",
 1899                        drop=force,
 1900                    )
 1901
 1902                    if added_column:
 1903                        added_columns.append(added_column)
 1904
 1905                    if added_column or force:
 1906
 1907                        # add field to index
 1908                        self.index_additionnal_fields.append(info_id_sql)
 1909
 1910                        # Update field array
 1911                        if connexion_format in ["duckdb"]:
 1912                            update_info_field = f"""
 1913                            "{info_id_sql}" =
 1914                                CASE
 1915                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
 1916                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
 1917                                END
 1918                            """
 1919                        elif connexion_format in ["sqlite"]:
 1920                            update_info_field = f"""
 1921                                "{info_id_sql}" =
 1922                                    CASE
 1923                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
 1924                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
 1925                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
 1926                                    END
 1927                            """
 1928
 1929                        sql_info_alter_table_array.append(update_info_field)
 1930
 1931            if sql_info_alter_table_array:
 1932
 1933                # By chromosomes
 1934                try:
 1935                    chromosomes_list = list(
 1936                        self.get_query_to_df(
 1937                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
 1938                        )["#CHROM"]
 1939                    )
 1940                except:
 1941                    chromosomes_list = [None]
 1942
 1943                for chrom in chromosomes_list:
 1944                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
 1945
 1946                    # Where clause
 1947                    where_clause = ""
 1948                    if chrom and len(chromosomes_list) > 1:
 1949                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
 1950
 1951                    # Update table
 1952                    if proccess_all_fields_together:
 1953                        sql_info_alter_table_array_join = ", ".join(
 1954                            sql_info_alter_table_array
 1955                        )
 1956                        if sql_info_alter_table_array_join:
 1957                            sql_info_alter_table = f"""
 1958                                UPDATE {table_variants}
 1959                                SET {sql_info_alter_table_array_join}
 1960                                {where_clause}
 1961                                """
 1962                            log.debug(
 1963                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
 1964                            )
 1965                            # log.debug(sql_info_alter_table)
 1966                            self.conn.execute(sql_info_alter_table)
 1967                    else:
 1968                        sql_info_alter_num = 0
 1969                        for sql_info_alter in sql_info_alter_table_array:
 1970                            sql_info_alter_num += 1
 1971                            sql_info_alter_table = f"""
 1972                                UPDATE {table_variants}
 1973                                SET {sql_info_alter}
 1974                                {where_clause}
 1975                                """
 1976                            log.debug(
 1977                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
 1978                            )
 1979                            # log.debug(sql_info_alter_table)
 1980                            self.conn.execute(sql_info_alter_table)
 1981
 1982        # create indexes
 1983        if create_index:
 1984            self.create_indexes()
 1985
 1986        return added_columns
 1987
 1988    def create_indexes(self) -> None:
 1989        """
 1990        Create indexes on the table after insertion
 1991        """
 1992
 1993        # Access
 1994        access = self.get_config().get("access", None)
 1995
 1996        # get table variants
 1997        table_variants = self.get_table_variants("FROM")
 1998
 1999        if self.get_indexing() and access not in ["RO"]:
 2000            # Create index
 2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2002            self.conn.execute(sql_create_table_index)
 2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2004            self.conn.execute(sql_create_table_index)
 2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2006            self.conn.execute(sql_create_table_index)
 2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2008            self.conn.execute(sql_create_table_index)
 2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2010            self.conn.execute(sql_create_table_index)
 2011            for field in self.index_additionnal_fields:
 2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2013                self.conn.execute(sql_create_table_index)
 2014
 2015    def drop_indexes(self) -> None:
 2016        """
 2017        Create indexes on the table after insertion
 2018        """
 2019
 2020        # Access
 2021        access = self.get_config().get("access", None)
 2022
 2023        # get table variants
 2024        table_variants = self.get_table_variants("FROM")
 2025
 2026        # Get database format
 2027        connexion_format = self.get_connexion_format()
 2028
 2029        if access not in ["RO"]:
 2030            if connexion_format in ["duckdb"]:
 2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2032            elif connexion_format in ["sqlite"]:
 2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2034
 2035            list_indexes = self.conn.execute(sql_list_indexes)
 2036            index_names = [row[0] for row in list_indexes.fetchall()]
 2037            for index in index_names:
 2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2039                self.conn.execute(sql_drop_table_index)
 2040
 2041    def read_vcf_header(self, f) -> list:
 2042        """
 2043        It reads the header of a VCF file and returns a list of the header lines
 2044
 2045        :param f: the file object
 2046        :return: The header lines of the VCF file.
 2047        """
 2048
 2049        header_list = []
 2050        for line in f:
 2051            header_list.append(line)
 2052            if line.startswith("#CHROM"):
 2053                break
 2054        return header_list
 2055
 2056    def read_vcf_header_file(self, file: str = None) -> list:
 2057        """
 2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2059        uncompressed files.
 2060
 2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2063        default to `None`
 2064        :type file: str
 2065        :return: The function `read_vcf_header_file` returns a list.
 2066        """
 2067
 2068        if self.get_input_compressed(input_file=file):
 2069            with bgzf.open(file, "rt") as f:
 2070                return self.read_vcf_header(f=f)
 2071        else:
 2072            with open(file, "rt") as f:
 2073                return self.read_vcf_header(f=f)
 2074
 2075    def execute_query(self, query: str):
 2076        """
 2077        It takes a query as an argument, executes it, and returns the results
 2078
 2079        :param query: The query to be executed
 2080        :return: The result of the query is being returned.
 2081        """
 2082        if query:
 2083            return self.conn.execute(query)  # .fetchall()
 2084        else:
 2085            return None
 2086
 2087    def export_output(
 2088        self,
 2089        output_file: str | None = None,
 2090        output_header: str | None = None,
 2091        export_header: bool = True,
 2092        query: str | None = None,
 2093        parquet_partitions: list | None = None,
 2094        chunk_size: int | None = None,
 2095        threads: int | None = None,
 2096        sort: bool = False,
 2097        index: bool = False,
 2098        order_by: str | None = None,
 2099        fields_to_rename: dict | None = None
 2100    ) -> bool:
 2101        """
 2102        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2103        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2104        partitioning.
 2105        
 2106        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2107        output file where the exported data will be saved
 2108        :type output_file: str | None
 2109        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2110        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2111        header will be exported to a file with the same name as the `output_file` parameter, but with
 2112        the extension "
 2113        :type output_header: str | None
 2114        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2115        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2116        True, the header will be exported to a file. If `export_header` is False, the header will not
 2117        be, defaults to True
 2118        :type export_header: bool (optional)
 2119        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2120        that can be used to filter and select specific data from the VCF file before exporting it. If
 2121        provided, only the data that matches the query will be exported. This allows you to customize
 2122        the exported data based on
 2123        :type query: str | None
 2124        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2125        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2126        organize data in a hierarchical directory structure based on the values of one or more columns.
 2127        This can improve query performance when working with large datasets
 2128        :type parquet_partitions: list | None
 2129        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2130        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2131        multiple files. It helps in optimizing the export process by breaking down the data into
 2132        manageable chunks for processing and storage
 2133        :type chunk_size: int | None
 2134        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2135        threads to be used during the export process. It determines the level of parallelism and can
 2136        improve the performance of the export operation. If this parameter is not provided, the function
 2137        will use the default number of threads
 2138        :type threads: int | None
 2139        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2140        determines whether the output file should be sorted based on genomic coordinates of the
 2141        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2142        `False`,, defaults to False
 2143        :type sort: bool (optional)
 2144        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2145        determines whether an index should be created on the output file. If `index` is set to `True`,
 2146        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2147        :type index: bool (optional)
 2148        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2149        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2150        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2151        output file should be
 2152        :type order_by: str | None
 2153        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2154        mapping of field names to be renamed during the export process. This parameter allows you to
 2155        customize the output field names before exporting the data. Each key-value pair in the
 2156        dictionary represents the original field name as the key and the new field name
 2157        :type fields_to_rename: dict | None
 2158        :return: The `export_output` function returns a boolean value. It checks if the output file
 2159        exists and returns True if it does, or None if it doesn't.
 2160        """
 2161
 2162        # Log
 2163        log.info("Exporting...")
 2164
 2165        # Full path
 2166        output_file = full_path(output_file)
 2167        output_header = full_path(output_header)
 2168
 2169        # Config
 2170        config = self.get_config()
 2171
 2172        # Param
 2173        param = self.get_param()
 2174
 2175        # Tmp files to remove
 2176        tmp_to_remove = []
 2177
 2178        # If no output, get it
 2179        if not output_file:
 2180            output_file = self.get_output()
 2181
 2182        # If not threads
 2183        if not threads:
 2184            threads = self.get_threads()
 2185
 2186        # Rename fields
 2187        if not fields_to_rename:
 2188            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2189        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2190
 2191        # Auto header name with extension
 2192        if export_header or output_header:
 2193            if not output_header:
 2194                output_header = f"{output_file}.hdr"
 2195            # Export header
 2196            self.export_header(output_file=output_file)
 2197
 2198        # Switch off export header if VCF output
 2199        output_file_type = get_file_format(output_file)
 2200        if output_file_type in ["vcf"]:
 2201            export_header = False
 2202            tmp_to_remove.append(output_header)
 2203
 2204        # Chunk size
 2205        if not chunk_size:
 2206            chunk_size = config.get("chunk_size", None)
 2207
 2208        # Parquet partition
 2209        if not parquet_partitions:
 2210            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2211        if parquet_partitions and isinstance(parquet_partitions, str):
 2212            parquet_partitions = parquet_partitions.split(",")
 2213
 2214        # Order by
 2215        if not order_by:
 2216            order_by = param.get("export", {}).get("order_by", "")
 2217
 2218        # Header in output
 2219        header_in_output = param.get("export", {}).get("include_header", False)
 2220
 2221        # Database
 2222        database_source = self.get_connexion()
 2223
 2224        # Connexion format
 2225        connexion_format = self.get_connexion_format()
 2226
 2227        # Explode infos
 2228        if self.get_explode_infos():
 2229            self.explode_infos(
 2230                prefix=self.get_explode_infos_prefix(),
 2231                fields=self.get_explode_infos_fields(),
 2232                force=False,
 2233            )
 2234
 2235        # if connexion_format in ["sqlite"] or query:
 2236        if connexion_format in ["sqlite"]:
 2237
 2238            # Export in Parquet
 2239            random_tmp = "".join(
 2240                random.choice(string.ascii_lowercase) for i in range(10)
 2241            )
 2242            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2243            tmp_to_remove.append(database_source)
 2244
 2245            # Table Variants
 2246            table_variants = self.get_table_variants()
 2247
 2248            # Create export query
 2249            sql_query_export_subquery = f"""
 2250                SELECT * FROM {table_variants}
 2251                """
 2252
 2253            # Write source file
 2254            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2255
 2256        # Create database
 2257        database = Database(
 2258            database=database_source,
 2259            table="variants",
 2260            header_file=output_header,
 2261            conn_config=self.get_connexion_config(),
 2262        )
 2263
 2264        # Existing colomns header
 2265        existing_columns_header = database.get_header_columns_from_database(query=query)
 2266
 2267        # Sample list
 2268        if output_file_type in ["vcf"]:
 2269            get_samples = self.get_samples()
 2270            get_samples_check = self.get_samples_check()
 2271            samples_force = get_samples is not None
 2272            sample_list = self.get_header_sample_list(
 2273                check=get_samples_check,
 2274                samples=get_samples,
 2275                samples_force=samples_force,
 2276            )
 2277        else:
 2278            sample_list = None
 2279
 2280        # Export file
 2281        database.export(
 2282            output_database=output_file,
 2283            output_header=output_header,
 2284            existing_columns_header=existing_columns_header,
 2285            parquet_partitions=parquet_partitions,
 2286            chunk_size=chunk_size,
 2287            threads=threads,
 2288            sort=sort,
 2289            index=index,
 2290            header_in_output=header_in_output,
 2291            order_by=order_by,
 2292            query=query,
 2293            export_header=export_header,
 2294            sample_list=sample_list,
 2295        )
 2296
 2297        # Remove
 2298        remove_if_exists(tmp_to_remove)
 2299
 2300        return (os.path.exists(output_file) or None) and (
 2301            os.path.exists(output_file) or None
 2302        )
 2303
 2304    def get_extra_infos(self, table: str = None) -> list:
 2305        """
 2306        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2307        in the header.
 2308
 2309        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2310        name of the table from which you want to retrieve the extra columns that are not present in the
 2311        header. If the `table` parameter is not provided when calling the function, it will default to
 2312        using the variants
 2313        :type table: str
 2314        :return: A list of columns that are in the specified table but not in the header of the table.
 2315        """
 2316
 2317        header_columns = []
 2318
 2319        if not table:
 2320            table = self.get_table_variants(clause="from")
 2321            header_columns = self.get_header_columns()
 2322
 2323        # Check all columns in the database
 2324        query = f""" SELECT * FROM {table} LIMIT 1 """
 2325        log.debug(f"query {query}")
 2326        table_columns = self.get_query_to_df(query).columns.tolist()
 2327        extra_columns = []
 2328
 2329        # Construct extra infos (not in header)
 2330        for column in table_columns:
 2331            if column not in header_columns:
 2332                extra_columns.append(column)
 2333
 2334        return extra_columns
 2335
 2336    def get_extra_infos_sql(self, table: str = None) -> str:
 2337        """
 2338        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2339        by double quotes
 2340
 2341        :param table: The name of the table to get the extra infos from. If None, the default table is
 2342        used
 2343        :type table: str
 2344        :return: A string of the extra infos
 2345        """
 2346
 2347        return ", ".join(
 2348            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2349        )
 2350
 2351    def export_header(
 2352        self,
 2353        header_name: str = None,
 2354        output_file: str = None,
 2355        output_file_ext: str = ".hdr",
 2356        clean_header: bool = True,
 2357        remove_chrom_line: bool = False,
 2358    ) -> str:
 2359        """
 2360        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2361        specified options, and writes it to a new file.
 2362
 2363        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2364        this parameter is not specified, the header will be written to the output file
 2365        :type header_name: str
 2366        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2367        specify the name of the output file where the header will be written. If this parameter is not
 2368        provided, the header will be written to a temporary file
 2369        :type output_file: str
 2370        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2371        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2372        if not specified by the user. This extension will be appended to the `output_file` name to
 2373        create the final, defaults to .hdr
 2374        :type output_file_ext: str (optional)
 2375        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2376        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2377        `True`, the function will clean the header by modifying certain lines based on a specific
 2378        pattern. If `clean_header`, defaults to True
 2379        :type clean_header: bool (optional)
 2380        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2381        boolean flag that determines whether the #CHROM line should be removed from the header before
 2382        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2383        defaults to False
 2384        :type remove_chrom_line: bool (optional)
 2385        :return: The function `export_header` returns the name of the temporary header file that is
 2386        created.
 2387        """
 2388
 2389        if not header_name and not output_file:
 2390            output_file = self.get_output()
 2391
 2392        if self.get_header():
 2393
 2394            # Get header object
 2395            header_obj = self.get_header()
 2396
 2397            # Create database
 2398            db_for_header = Database(database=self.get_input())
 2399
 2400            # Get real columns in the file
 2401            db_header_columns = db_for_header.get_columns()
 2402
 2403            with tempfile.TemporaryDirectory() as tmpdir:
 2404
 2405                # Write header file
 2406                header_file_tmp = os.path.join(tmpdir, "header")
 2407                f = open(header_file_tmp, "w")
 2408                vcf.Writer(f, header_obj)
 2409                f.close()
 2410
 2411                # Replace #CHROM line with rel columns
 2412                header_list = db_for_header.read_header_file(
 2413                    header_file=header_file_tmp
 2414                )
 2415                header_list[-1] = "\t".join(db_header_columns)
 2416
 2417                # Remove CHROM line
 2418                if remove_chrom_line:
 2419                    header_list.pop()
 2420
 2421                # Clean header
 2422                if clean_header:
 2423                    header_list_clean = []
 2424                    for head in header_list:
 2425                        # Clean head for malformed header
 2426                        head_clean = head
 2427                        head_clean = re.subn(
 2428                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2429                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2430                            head_clean,
 2431                            2,
 2432                        )[0]
 2433                        # Write header
 2434                        header_list_clean.append(head_clean)
 2435                    header_list = header_list_clean
 2436
 2437            tmp_header_name = output_file + output_file_ext
 2438
 2439            f = open(tmp_header_name, "w")
 2440            for line in header_list:
 2441                f.write(line)
 2442            f.close()
 2443
 2444        return tmp_header_name
 2445
 2446    def export_variant_vcf(
 2447        self,
 2448        vcf_file,
 2449        remove_info: bool = False,
 2450        add_samples: bool = True,
 2451        list_samples: list = [],
 2452        where_clause: str = "",
 2453        index: bool = False,
 2454        threads: int | None = None,
 2455    ) -> bool | None:
 2456        """
 2457        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2458        remove INFO field, add samples, and control compression and indexing.
 2459
 2460        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2461        written to. It is the output file that will contain the filtered VCF data based on the specified
 2462        parameters
 2463        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2464        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2465        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2466        in, defaults to False
 2467        :type remove_info: bool (optional)
 2468        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2469        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2470        If set to False, the samples will be removed. The default value is True, defaults to True
 2471        :type add_samples: bool (optional)
 2472        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2473        in the output VCF file. By default, all samples will be included. If you provide a list of
 2474        samples, only those samples will be included in the output file
 2475        :type list_samples: list
 2476        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2477        determines whether or not to create an index for the output VCF file. If `index` is set to
 2478        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2479        :type index: bool (optional)
 2480        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2481        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2482        will be used during the export process. More threads can potentially speed up the export process
 2483        by utilizing multiple cores of the processor. If
 2484        :type threads: int | None
 2485        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2486        method with various parameters including the output file, query, threads, sort flag, and index
 2487        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2488        specified parameters and configurations provided in the `export_variant_vcf` function.
 2489        """
 2490
 2491        # Config
 2492        config = self.get_config()
 2493
 2494        # Extract VCF
 2495        log.debug("Export VCF...")
 2496
 2497        # Table variants
 2498        table_variants = self.get_table_variants()
 2499
 2500        # Threads
 2501        if not threads:
 2502            threads = self.get_threads()
 2503
 2504        # Info fields
 2505        if remove_info:
 2506            if not isinstance(remove_info, str):
 2507                remove_info = "."
 2508            info_field = f"""'{remove_info}' as INFO"""
 2509        else:
 2510            info_field = "INFO"
 2511
 2512        # Samples fields
 2513        if add_samples:
 2514            if not list_samples:
 2515                list_samples = self.get_header_sample_list()
 2516            if list_samples:
 2517                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2518            else:
 2519                samples_fields = ""
 2520            log.debug(f"samples_fields: {samples_fields}")
 2521        else:
 2522            samples_fields = ""
 2523
 2524        # Where clause
 2525        if where_clause is None:
 2526            where_clause = ""
 2527
 2528        # Variants
 2529        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2530        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2531        log.debug(f"sql_query_select={sql_query_select}")
 2532
 2533        return self.export_output(
 2534            output_file=vcf_file,
 2535            output_header=None,
 2536            export_header=True,
 2537            query=sql_query_select,
 2538            parquet_partitions=None,
 2539            chunk_size=config.get("chunk_size", None),
 2540            threads=threads,
 2541            sort=True,
 2542            index=index,
 2543            order_by=None,
 2544        )
 2545
 2546    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2547        """
 2548        It takes a list of commands and runs them in parallel using the number of threads specified
 2549
 2550        :param commands: A list of commands to run
 2551        :param threads: The number of threads to use, defaults to 1 (optional)
 2552        """
 2553
 2554        run_parallel_commands(commands, threads)
 2555
 2556    def get_threads(self, default: int = 1) -> int:
 2557        """
 2558        This function returns the number of threads to use for a job, with a default value of 1 if not
 2559        specified.
 2560
 2561        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2562        default number of threads to use if no specific value is provided. If no value is provided for
 2563        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2564        used, defaults to 1
 2565        :type default: int (optional)
 2566        :return: the number of threads to use for the current job.
 2567        """
 2568
 2569        # Config
 2570        config = self.get_config()
 2571
 2572        # Param
 2573        param = self.get_param()
 2574
 2575        # Input threads
 2576        input_thread = param.get("threads", config.get("threads", None))
 2577
 2578        # Check threads
 2579        if not input_thread:
 2580            threads = default
 2581        elif int(input_thread) <= 0:
 2582            threads = os.cpu_count()
 2583        else:
 2584            threads = int(input_thread)
 2585        return threads
 2586
 2587    def get_memory(self, default: str = None) -> str:
 2588        """
 2589        This function retrieves the memory value from parameters or configuration with a default value
 2590        if not found.
 2591
 2592        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2593        default value is used as a fallback in case the `memory` parameter is not provided in the
 2594        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2595        the function
 2596        :type default: str
 2597        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2598        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2599        return the default value provided as an argument to the function.
 2600        """
 2601
 2602        # Config
 2603        config = self.get_config()
 2604
 2605        # Param
 2606        param = self.get_param()
 2607
 2608        # Input threads
 2609        input_memory = param.get("memory", config.get("memory", None))
 2610
 2611        # Check threads
 2612        if input_memory:
 2613            memory = input_memory
 2614        else:
 2615            memory = default
 2616
 2617        return memory
 2618
 2619    def update_from_vcf(self, vcf_file: str) -> None:
 2620        """
 2621        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2622
 2623        :param vcf_file: the path to the VCF file
 2624        """
 2625
 2626        connexion_format = self.get_connexion_format()
 2627
 2628        if connexion_format in ["duckdb"]:
 2629            self.update_from_vcf_duckdb(vcf_file)
 2630        elif connexion_format in ["sqlite"]:
 2631            self.update_from_vcf_sqlite(vcf_file)
 2632
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (duckdb backend).

        The VCF body is loaded into a pandas DataFrame and joined against the
        variants table on #CHROM/POS/REF/ALT; matching non-empty INFO values
        are appended to the existing INFO, with ';' as separator when both
        sides are non-empty.

        :param vcf_file: the path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the meta-header lines
        # so that the '#CHROM' line becomes the column header (header=0)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: the SQL below reads "FROM vcf_df" — duckdb resolves that name
        # to this local DataFrame (replacement scan); do not rename vcf_df.
        # concat() in duckdb skips NULLs, so variants with no match in the VCF
        # keep their INFO unchanged.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2688
 2689    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2690        """
 2691        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2692        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2693        table
 2694
 2695        :param vcf_file: The path to the VCF file you want to update the database with
 2696        """
 2697
 2698        # Create a temporary table for the VCF
 2699        table_vcf = "tmp_vcf"
 2700        sql_create = (
 2701            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2702        )
 2703        self.conn.execute(sql_create)
 2704
 2705        # Loading VCF into temporaire table
 2706        vcf_df = pd.read_csv(
 2707            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2708        )
 2709        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2710        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2711
 2712        # Update table 'variants' with VCF data
 2713        # warning: CONCAT as || operator
 2714        sql_query_update = f"""
 2715            UPDATE variants as table_variants
 2716            SET INFO = CASE
 2717                            WHEN INFO NOT IN ('', '.')
 2718                            THEN INFO
 2719                            ELSE ''
 2720                        END ||
 2721                        (
 2722                        SELECT 
 2723                            CASE 
 2724                                WHEN table_variants.INFO NOT IN ('','.') 
 2725                                    AND table_vcf.INFO NOT IN ('','.')  
 2726                                THEN ';' 
 2727                                ELSE '' 
 2728                            END || 
 2729                            CASE 
 2730                                WHEN table_vcf.INFO NOT IN ('','.') 
 2731                                THEN table_vcf.INFO 
 2732                                ELSE '' 
 2733                            END
 2734                        FROM {table_vcf} as table_vcf
 2735                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2736                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2737                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2738                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2739                        )
 2740        """
 2741        self.conn.execute(sql_query_update)
 2742
 2743        # Drop temporary table
 2744        sql_drop = f"DROP TABLE {table_vcf}"
 2745        self.conn.execute(sql_drop)
 2746
 2747    def drop_variants_table(self) -> None:
 2748        """
 2749        > This function drops the variants table
 2750        """
 2751
 2752        table_variants = self.get_table_variants()
 2753        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2754        self.conn.execute(sql_table_variants)
 2755
 2756    def set_variant_id(
 2757        self, variant_id_column: str = "variant_id", force: bool = None
 2758    ) -> str:
 2759        """
 2760        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2761        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2762
 2763        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2764        to variant_id
 2765        :type variant_id_column: str (optional)
 2766        :param force: If True, the variant_id column will be created even if it already exists
 2767        :type force: bool
 2768        :return: The name of the column that contains the variant_id
 2769        """
 2770
 2771        # Assembly
 2772        assembly = self.get_param().get(
 2773            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2774        )
 2775
 2776        # INFO/Tag prefix
 2777        prefix = self.get_explode_infos_prefix()
 2778
 2779        # Explode INFO/SVTYPE
 2780        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2781
 2782        # variants table
 2783        table_variants = self.get_table_variants()
 2784
 2785        # variant_id column
 2786        if not variant_id_column:
 2787            variant_id_column = "variant_id"
 2788
 2789        # Creta variant_id column
 2790        if "variant_id" not in self.get_extra_infos() or force:
 2791
 2792            # Create column
 2793            self.add_column(
 2794                table_name=table_variants,
 2795                column_name=variant_id_column,
 2796                column_type="UBIGINT",
 2797                default_value="0",
 2798            )
 2799
 2800            # Update column
 2801            self.conn.execute(
 2802                f"""
 2803                    UPDATE {table_variants}
 2804                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2805                """
 2806            )
 2807
 2808        # Remove added columns
 2809        for added_column in added_columns:
 2810            self.drop_column(column=added_column)
 2811
 2812        # return variant_id column name
 2813        return variant_id_column
 2814
 2815    def get_variant_id_column(
 2816        self, variant_id_column: str = "variant_id", force: bool = None
 2817    ) -> str:
 2818        """
 2819        This function returns the variant_id column name
 2820
 2821        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2822        defaults to variant_id
 2823        :type variant_id_column: str (optional)
 2824        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2825        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2826        if it is not already set, or if it is set
 2827        :type force: bool
 2828        :return: The variant_id column name.
 2829        """
 2830
 2831        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2832
 2833    ###
 2834    # Annotation
 2835    ###
 2836
 2837    def scan_databases(
 2838        self,
 2839        database_formats: list = ["parquet"],
 2840        database_releases: list = ["current"],
 2841    ) -> dict:
 2842        """
 2843        The function `scan_databases` scans for available databases based on specified formats and
 2844        releases.
 2845
 2846        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2847        of the databases to be scanned. In this case, the accepted format is "parquet"
 2848        :type database_formats: list ["parquet"]
 2849        :param database_releases: The `database_releases` parameter is a list that specifies the
 2850        releases of the databases to be scanned. In the provided function, the default value for
 2851        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2852        databases that are in the "current"
 2853        :type database_releases: list
 2854        :return: The function `scan_databases` returns a dictionary containing information about
 2855        databases that match the specified formats and releases.
 2856        """
 2857
 2858        # Config
 2859        config = self.get_config()
 2860
 2861        # Param
 2862        param = self.get_param()
 2863
 2864        # Param - Assembly
 2865        assembly = param.get("assembly", config.get("assembly", None))
 2866        if not assembly:
 2867            assembly = DEFAULT_ASSEMBLY
 2868            log.warning(f"Default assembly '{assembly}'")
 2869
 2870        # Scan for availabled databases
 2871        log.info(
 2872            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2873        )
 2874        databases_infos_dict = databases_infos(
 2875            database_folder_releases=database_releases,
 2876            database_formats=database_formats,
 2877            assembly=assembly,
 2878            config=config,
 2879        )
 2880        log.info(
 2881            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2882        )
 2883
 2884        return databases_infos_dict
 2885
 2886    def annotation(self) -> None:
 2887        """
 2888        It annotates the VCF file with the annotations specified in the config file.
 2889        """
 2890
 2891        # Config
 2892        config = self.get_config()
 2893
 2894        # Param
 2895        param = self.get_param()
 2896
 2897        # Param - Assembly
 2898        assembly = param.get("assembly", config.get("assembly", None))
 2899        if not assembly:
 2900            assembly = DEFAULT_ASSEMBLY
 2901            log.warning(f"Default assembly '{assembly}'")
 2902
 2903        # annotations databases folders
 2904        annotations_databases = set(
 2905            config.get("folders", {})
 2906            .get("databases", {})
 2907            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2908            + config.get("folders", {})
 2909            .get("databases", {})
 2910            .get("parquet", ["~/howard/databases/parquet/current"])
 2911            + config.get("folders", {})
 2912            .get("databases", {})
 2913            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2914        )
 2915
 2916        # Get param annotations
 2917        if param.get("annotations", None) and isinstance(
 2918            param.get("annotations", None), str
 2919        ):
 2920            log.debug(param.get("annotations", None))
 2921            param_annotation_list = param.get("annotations").split(",")
 2922        else:
 2923            param_annotation_list = []
 2924
 2925        # Each tools param
 2926        if param.get("annotation_parquet", None) != None:
 2927            log.debug(
 2928                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2929            )
 2930            if isinstance(param.get("annotation_parquet", None), list):
 2931                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2932            else:
 2933                param_annotation_list.append(param.get("annotation_parquet"))
 2934        if param.get("annotation_snpsift", None) != None:
 2935            if isinstance(param.get("annotation_snpsift", None), list):
 2936                param_annotation_list.append(
 2937                    "snpsift:"
 2938                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2939                )
 2940            else:
 2941                param_annotation_list.append(
 2942                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2943                )
 2944        if param.get("annotation_snpeff", None) != None:
 2945            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2946        if param.get("annotation_bcftools", None) != None:
 2947            if isinstance(param.get("annotation_bcftools", None), list):
 2948                param_annotation_list.append(
 2949                    "bcftools:"
 2950                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2951                )
 2952            else:
 2953                param_annotation_list.append(
 2954                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2955                )
 2956        if param.get("annotation_annovar", None) != None:
 2957            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2958        if param.get("annotation_exomiser", None) != None:
 2959            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2960        if param.get("annotation_splice", None) != None:
 2961            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2962
 2963        # Merge param annotations list
 2964        param["annotations"] = ",".join(param_annotation_list)
 2965
 2966        # debug
 2967        log.debug(f"param_annotations={param['annotations']}")
 2968
 2969        if param.get("annotations"):
 2970
 2971            # Log
 2972            # log.info("Annotations - Check annotation parameters")
 2973
 2974            if not "annotation" in param:
 2975                param["annotation"] = {}
 2976
 2977            # List of annotations parameters
 2978            annotations_list_input = {}
 2979            if isinstance(param.get("annotations", None), str):
 2980                annotation_file_list = [
 2981                    value for value in param.get("annotations", "").split(",")
 2982                ]
 2983                for annotation_file in annotation_file_list:
 2984                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2985            else:
 2986                annotations_list_input = param.get("annotations", {})
 2987
 2988            log.info(f"Quick Annotations:")
 2989            for annotation_key in list(annotations_list_input.keys()):
 2990                log.info(f"   {annotation_key}")
 2991
 2992            # List of annotations and associated fields
 2993            annotations_list = {}
 2994
 2995            for annotation_file in annotations_list_input:
 2996
 2997                # Explode annotations if ALL
 2998                if (
 2999                    annotation_file.upper() == "ALL"
 3000                    or annotation_file.upper().startswith("ALL:")
 3001                ):
 3002
 3003                    # check ALL parameters (formats, releases)
 3004                    annotation_file_split = annotation_file.split(":")
 3005                    database_formats = "parquet"
 3006                    database_releases = "current"
 3007                    for annotation_file_option in annotation_file_split[1:]:
 3008                        database_all_options_split = annotation_file_option.split("=")
 3009                        if database_all_options_split[0] == "format":
 3010                            database_formats = database_all_options_split[1].split("+")
 3011                        if database_all_options_split[0] == "release":
 3012                            database_releases = database_all_options_split[1].split("+")
 3013
 3014                    # Scan for availabled databases
 3015                    databases_infos_dict = self.scan_databases(
 3016                        database_formats=database_formats,
 3017                        database_releases=database_releases,
 3018                    )
 3019
 3020                    # Add found databases in annotation parameters
 3021                    for database_infos in databases_infos_dict.keys():
 3022                        annotations_list[database_infos] = {"INFO": None}
 3023
 3024                else:
 3025                    annotations_list[annotation_file] = annotations_list_input[
 3026                        annotation_file
 3027                    ]
 3028
 3029            # Check each databases
 3030            if len(annotations_list):
 3031
 3032                log.info(
 3033                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3034                )
 3035
 3036                for annotation_file in annotations_list:
 3037
 3038                    # Init
 3039                    annotations = annotations_list.get(annotation_file, None)
 3040
 3041                    # Annotation snpEff
 3042                    if annotation_file.startswith("snpeff"):
 3043
 3044                        log.debug(f"Quick Annotation snpEff")
 3045
 3046                        if "snpeff" not in param["annotation"]:
 3047                            param["annotation"]["snpeff"] = {}
 3048
 3049                        if "options" not in param["annotation"]["snpeff"]:
 3050                            param["annotation"]["snpeff"]["options"] = ""
 3051
 3052                        # snpEff options in annotations
 3053                        param["annotation"]["snpeff"]["options"] = "".join(
 3054                            annotation_file.split(":")[1:]
 3055                        )
 3056
 3057                    # Annotation Annovar
 3058                    elif annotation_file.startswith("annovar"):
 3059
 3060                        log.debug(f"Quick Annotation Annovar")
 3061
 3062                        if "annovar" not in param["annotation"]:
 3063                            param["annotation"]["annovar"] = {}
 3064
 3065                        if "annotations" not in param["annotation"]["annovar"]:
 3066                            param["annotation"]["annovar"]["annotations"] = {}
 3067
 3068                        # Options
 3069                        annotation_file_split = annotation_file.split(":")
 3070                        for annotation_file_annotation in annotation_file_split[1:]:
 3071                            if annotation_file_annotation:
 3072                                param["annotation"]["annovar"]["annotations"][
 3073                                    annotation_file_annotation
 3074                                ] = annotations
 3075
 3076                    # Annotation Exomiser
 3077                    elif annotation_file.startswith("exomiser"):
 3078
 3079                        log.debug(f"Quick Annotation Exomiser")
 3080
 3081                        param["annotation"]["exomiser"] = params_string_to_dict(
 3082                            annotation_file
 3083                        )
 3084
 3085                    # Annotation Splice
 3086                    elif annotation_file.startswith("splice"):
 3087
 3088                        log.debug(f"Quick Annotation Splice")
 3089
 3090                        param["annotation"]["splice"] = params_string_to_dict(
 3091                            annotation_file
 3092                        )
 3093
 3094                    # Annotation Parquet or BCFTOOLS
 3095                    else:
 3096
 3097                        # Tools detection
 3098                        if annotation_file.startswith("bcftools:"):
 3099                            annotation_tool_initial = "bcftools"
 3100                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3101                        elif annotation_file.startswith("snpsift:"):
 3102                            annotation_tool_initial = "snpsift"
 3103                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3104                        elif annotation_file.startswith("bigwig:"):
 3105                            annotation_tool_initial = "bigwig"
 3106                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3107                        else:
 3108                            annotation_tool_initial = None
 3109
 3110                        # list of files
 3111                        annotation_file_list = annotation_file.replace("+", ":").split(
 3112                            ":"
 3113                        )
 3114
 3115                        for annotation_file in annotation_file_list:
 3116
 3117                            if annotation_file:
 3118
 3119                                # Annotation tool initial
 3120                                annotation_tool = annotation_tool_initial
 3121
 3122                                # Find file
 3123                                annotation_file_found = None
 3124
 3125                                if os.path.exists(annotation_file):
 3126                                    annotation_file_found = annotation_file
 3127                                elif os.path.exists(full_path(annotation_file)):
 3128                                    annotation_file_found = full_path(annotation_file)
 3129                                else:
 3130                                    # Find within assembly folders
 3131                                    for annotations_database in annotations_databases:
 3132                                        found_files = find_all(
 3133                                            annotation_file,
 3134                                            os.path.join(
 3135                                                annotations_database, assembly
 3136                                            ),
 3137                                        )
 3138                                        if len(found_files) > 0:
 3139                                            annotation_file_found = found_files[0]
 3140                                            break
 3141                                    if not annotation_file_found and not assembly:
 3142                                        # Find within folders
 3143                                        for (
 3144                                            annotations_database
 3145                                        ) in annotations_databases:
 3146                                            found_files = find_all(
 3147                                                annotation_file, annotations_database
 3148                                            )
 3149                                            if len(found_files) > 0:
 3150                                                annotation_file_found = found_files[0]
 3151                                                break
 3152                                log.debug(
 3153                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3154                                )
 3155
 3156                                # Full path
 3157                                annotation_file_found = full_path(annotation_file_found)
 3158
 3159                                if annotation_file_found:
 3160
 3161                                    database = Database(database=annotation_file_found)
 3162                                    quick_annotation_format = database.get_format()
 3163                                    quick_annotation_is_compressed = (
 3164                                        database.is_compressed()
 3165                                    )
 3166                                    quick_annotation_is_indexed = os.path.exists(
 3167                                        f"{annotation_file_found}.tbi"
 3168                                    )
 3169                                    bcftools_preference = False
 3170
 3171                                    # Check Annotation Tool
 3172                                    if not annotation_tool:
 3173                                        if (
 3174                                            bcftools_preference
 3175                                            and quick_annotation_format
 3176                                            in ["vcf", "bed"]
 3177                                            and quick_annotation_is_compressed
 3178                                            and quick_annotation_is_indexed
 3179                                        ):
 3180                                            annotation_tool = "bcftools"
 3181                                        elif quick_annotation_format in [
 3182                                            "vcf",
 3183                                            "bed",
 3184                                            "tsv",
 3185                                            "tsv",
 3186                                            "csv",
 3187                                            "json",
 3188                                            "tbl",
 3189                                            "parquet",
 3190                                            "duckdb",
 3191                                        ]:
 3192                                            annotation_tool = "parquet"
 3193                                        elif quick_annotation_format in ["bw"]:
 3194                                            annotation_tool = "bigwig"
 3195                                        else:
 3196                                            log.error(
 3197                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3198                                            )
 3199                                            raise ValueError(
 3200                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3201                                            )
 3202
 3203                                    log.debug(
 3204                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3205                                    )
 3206
 3207                                    # Annotation Tool dispatch
 3208                                    if annotation_tool:
 3209                                        if annotation_tool not in param["annotation"]:
 3210                                            param["annotation"][annotation_tool] = {}
 3211                                        if (
 3212                                            "annotations"
 3213                                            not in param["annotation"][annotation_tool]
 3214                                        ):
 3215                                            param["annotation"][annotation_tool][
 3216                                                "annotations"
 3217                                            ] = {}
 3218                                        param["annotation"][annotation_tool][
 3219                                            "annotations"
 3220                                        ][annotation_file_found] = annotations
 3221
 3222                                else:
 3223                                    log.warning(
 3224                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3225                                    )
 3226
 3227                self.set_param(param)
 3228
 3229        if param.get("annotation", None):
 3230            log.info("Annotations")
 3231            if param.get("annotation", {}).get("parquet", None):
 3232                log.info("Annotations 'parquet'...")
 3233                self.annotation_parquet()
 3234            if param.get("annotation", {}).get("bcftools", None):
 3235                log.info("Annotations 'bcftools'...")
 3236                self.annotation_bcftools()
 3237            if param.get("annotation", {}).get("snpsift", None):
 3238                log.info("Annotations 'snpsift'...")
 3239                self.annotation_snpsift()
 3240            if param.get("annotation", {}).get("bigwig", None):
 3241                log.info("Annotations 'bigwig'...")
 3242                self.annotation_bigwig()
 3243            if param.get("annotation", {}).get("annovar", None):
 3244                log.info("Annotations 'annovar'...")
 3245                self.annotation_annovar()
 3246            if param.get("annotation", {}).get("snpeff", None):
 3247                log.info("Annotations 'snpeff'...")
 3248                self.annotation_snpeff()
 3249            if param.get("annotation", {}).get("exomiser", None) is not None:
 3250                log.info("Annotations 'exomiser'...")
 3251                self.annotation_exomiser()
 3252            if param.get("annotation", {}).get("splice", None) is not None:
 3253                log.info("Annotations 'splice' ...")
 3254                self.annotation_splice()
 3255
 3256        # Explode INFOS fields into table fields
 3257        if self.get_explode_infos():
 3258            self.explode_infos(
 3259                prefix=self.get_explode_infos_prefix(),
 3260                fields=self.get_explode_infos_fields(),
 3261                force=True,
 3262            )
 3263
 3264    def annotation_bigwig(self, threads: int = None) -> None:
 3265        """
 3266        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3267
 3268        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3269        number of threads to be used for parallel processing during the annotation process. If the
 3270        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3271        threads to use based on the system configuration
 3272        :type threads: int
 3273        :return: True
 3274        """
 3275
 3276        # DEBUG
 3277        log.debug("Start annotation with bigwig databases")
 3278
 3279        # # Threads
 3280        # if not threads:
 3281        #     threads = self.get_threads()
 3282        # log.debug("Threads: " + str(threads))
 3283
 3284        # Config
 3285        config = self.get_config()
 3286        log.debug("Config: " + str(config))
 3287
 3288        # Config - BCFTools databases folders
 3289        databases_folders = set(
 3290            self.get_config()
 3291            .get("folders", {})
 3292            .get("databases", {})
 3293            .get("annotations", ["."])
 3294            + self.get_config()
 3295            .get("folders", {})
 3296            .get("databases", {})
 3297            .get("bigwig", ["."])
 3298        )
 3299        log.debug("Databases annotations: " + str(databases_folders))
 3300
 3301        # Param
 3302        annotations = (
 3303            self.get_param()
 3304            .get("annotation", {})
 3305            .get("bigwig", {})
 3306            .get("annotations", None)
 3307        )
 3308        log.debug("Annotations: " + str(annotations))
 3309
 3310        # Assembly
 3311        assembly = self.get_param().get(
 3312            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3313        )
 3314
 3315        # Data
 3316        table_variants = self.get_table_variants()
 3317
 3318        # Check if not empty
 3319        log.debug("Check if not empty")
 3320        sql_query_chromosomes = (
 3321            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3322        )
 3323        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3324        if not sql_query_chromosomes_df["count"][0]:
 3325            log.info(f"VCF empty")
 3326            return
 3327
 3328        # VCF header
 3329        vcf_reader = self.get_header()
 3330        log.debug("Initial header: " + str(vcf_reader.infos))
 3331
 3332        # Existing annotations
 3333        for vcf_annotation in self.get_header().infos:
 3334
 3335            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3336            log.debug(
 3337                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3338            )
 3339
 3340        if annotations:
 3341
 3342            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3343
 3344                # Export VCF file
 3345                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3346
 3347                # annotation_bigwig_config
 3348                annotation_bigwig_config_list = []
 3349
 3350                for annotation in annotations:
 3351                    annotation_fields = annotations[annotation]
 3352
 3353                    # Annotation Name
 3354                    annotation_name = os.path.basename(annotation)
 3355
 3356                    if not annotation_fields:
 3357                        annotation_fields = {"INFO": None}
 3358
 3359                    log.debug(f"Annotation '{annotation_name}'")
 3360                    log.debug(
 3361                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3362                    )
 3363
 3364                    # Create Database
 3365                    database = Database(
 3366                        database=annotation,
 3367                        databases_folders=databases_folders,
 3368                        assembly=assembly,
 3369                    )
 3370
 3371                    # Find files
 3372                    db_file = database.get_database()
 3373                    db_file = full_path(db_file)
 3374                    db_hdr_file = database.get_header_file()
 3375                    db_hdr_file = full_path(db_hdr_file)
 3376                    db_file_type = database.get_format()
 3377
 3378                    # If db_file is http ?
 3379                    if database.get_database().startswith("http"):
 3380
 3381                        # Datbase is HTTP URL
 3382                        db_file_is_http = True
 3383
 3384                        # DB file keep as URL
 3385                        db_file = database.get_database()
 3386                        log.warning(
 3387                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3388                        )
 3389
 3390                        # Retrieve automatic annotation field name
 3391                        annotation_field = clean_annotation_field(
 3392                            os.path.basename(db_file).replace(".bw", "")
 3393                        )
 3394                        log.debug(
 3395                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3396                        )
 3397
 3398                        # Create automatic header file
 3399                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3400                        with open(db_hdr_file, "w") as f:
 3401                            f.write("##fileformat=VCFv4.2\n")
 3402                            f.write(
 3403                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3404                            )
 3405                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3406
 3407                    else:
 3408
 3409                        # Datbase is NOT HTTP URL
 3410                        db_file_is_http = False
 3411
 3412                    # Check index - try to create if not exists
 3413                    if (
 3414                        db_file is None
 3415                        or db_hdr_file is None
 3416                        or (not os.path.exists(db_file) and not db_file_is_http)
 3417                        or not os.path.exists(db_hdr_file)
 3418                        or not db_file_type in ["bw"]
 3419                    ):
 3420                        # if False:
 3421                        log.error("Annotation failed: database not valid")
 3422                        log.error(f"Annotation annotation file: {db_file}")
 3423                        log.error(f"Annotation annotation file type: {db_file_type}")
 3424                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3425                        raise ValueError(
 3426                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3427                        )
 3428                    else:
 3429
 3430                        # Log
 3431                        log.debug(
 3432                            f"Annotation '{annotation}' - file: "
 3433                            + str(db_file)
 3434                            + " and "
 3435                            + str(db_hdr_file)
 3436                        )
 3437
 3438                        # Load header as VCF object
 3439                        db_hdr_vcf = Variants(input=db_hdr_file)
 3440                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3441                        log.debug(
 3442                            "Annotation database header: "
 3443                            + str(db_hdr_vcf_header_infos)
 3444                        )
 3445
 3446                        # For all fields in database
 3447                        annotation_fields_full = False
 3448                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3449                            annotation_fields = {
 3450                                key: key for key in db_hdr_vcf_header_infos
 3451                            }
 3452                            log.debug(
 3453                                "Annotation database header - All annotations added: "
 3454                                + str(annotation_fields)
 3455                            )
 3456                            annotation_fields_full = True
 3457
 3458                        # Init
 3459                        cyvcf2_header_rename_dict = {}
 3460                        cyvcf2_header_list = []
 3461                        cyvcf2_header_indexes = {}
 3462
 3463                        # process annotation fields
 3464                        for annotation_field in annotation_fields:
 3465
 3466                            # New annotation name
 3467                            annotation_field_new = annotation_fields[annotation_field]
 3468
 3469                            # Check annotation field and index in header
 3470                            if (
 3471                                annotation_field
 3472                                in db_hdr_vcf.get_header_columns_as_list()
 3473                            ):
 3474                                annotation_field_index = (
 3475                                    db_hdr_vcf.get_header_columns_as_list().index(
 3476                                        annotation_field
 3477                                    )
 3478                                    - 3
 3479                                )
 3480                                cyvcf2_header_indexes[annotation_field_new] = (
 3481                                    annotation_field_index
 3482                                )
 3483                            else:
 3484                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3485                                log.error(msg_err)
 3486                                raise ValueError(msg_err)
 3487
 3488                            # Append annotation field in cyvcf2 header list
 3489                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3490                                db_hdr_vcf_header_infos[annotation_field].id
 3491                            )
 3492                            cyvcf2_header_list.append(
 3493                                {
 3494                                    "ID": annotation_field_new,
 3495                                    "Number": db_hdr_vcf_header_infos[
 3496                                        annotation_field
 3497                                    ].num,
 3498                                    "Type": db_hdr_vcf_header_infos[
 3499                                        annotation_field
 3500                                    ].type,
 3501                                    "Description": db_hdr_vcf_header_infos[
 3502                                        annotation_field
 3503                                    ].desc,
 3504                                }
 3505                            )
 3506
 3507                            # Add header on VCF
 3508                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3509                                annotation_field_new,
 3510                                db_hdr_vcf_header_infos[annotation_field].num,
 3511                                db_hdr_vcf_header_infos[annotation_field].type,
 3512                                db_hdr_vcf_header_infos[annotation_field].desc,
 3513                                "HOWARD BigWig annotation",
 3514                                "unknown",
 3515                                self.code_type_map[
 3516                                    db_hdr_vcf_header_infos[annotation_field].type
 3517                                ],
 3518                            )
 3519
 3520                        # Load bigwig database
 3521                        bw_db = pyBigWig.open(db_file)
 3522                        if bw_db.isBigWig():
 3523                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3524                        else:
 3525                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3526                            log.error(msg_err)
 3527                            raise ValueError(msg_err)
 3528
 3529                        annotation_bigwig_config_list.append(
 3530                            {
 3531                                "db_file": db_file,
 3532                                "bw_db": bw_db,
 3533                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3534                                "cyvcf2_header_list": cyvcf2_header_list,
 3535                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3536                            }
 3537                        )
 3538
 3539                # Annotate
 3540                if annotation_bigwig_config_list:
 3541
 3542                    # Annotation config
 3543                    log.debug(
 3544                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3545                    )
 3546
 3547                    # Export VCF file
 3548                    self.export_variant_vcf(
 3549                        vcf_file=tmp_vcf_name,
 3550                        remove_info=True,
 3551                        add_samples=False,
 3552                        index=True,
 3553                    )
 3554
 3555                    # Load input tmp file
 3556                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3557
 3558                    # Add header in input file
 3559                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3560                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3561                            "cyvcf2_header_list", []
 3562                        ):
 3563                            log.info(
 3564                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3565                            )
 3566                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3567
 3568                    # Create output VCF file
 3569                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3570                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3571
 3572                    # Fetch variants
 3573                    log.info(f"Annotations 'bigwig' start...")
 3574                    for variant in input_vcf:
 3575
 3576                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3577
 3578                            # DB and indexes
 3579                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3580                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3581                                "cyvcf2_header_indexes", None
 3582                            )
 3583
 3584                            # Retrieve value from chrom pos
 3585                            res = bw_db.values(
 3586                                variant.CHROM, variant.POS - 1, variant.POS
 3587                            )
 3588
 3589                            # For each annotation fields (and indexes)
 3590                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3591
 3592                                # If value is NOT nNone
 3593                                if not np.isnan(
 3594                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3595                                ):
 3596                                    variant.INFO[cyvcf2_header_index] = res[
 3597                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3598                                    ]
 3599
 3600                        # Add record in output file
 3601                        output_vcf.write_record(variant)
 3602
 3603                    # Log
 3604                    log.debug(f"Annotation done.")
 3605
 3606                    # Close and write file
 3607                    log.info(f"Annotations 'bigwig' write...")
 3608                    output_vcf.close()
 3609                    log.debug(f"Write done.")
 3610
 3611                    # Update variants
 3612                    log.info(f"Annotations 'bigwig' update...")
 3613                    self.update_from_vcf(output_vcf_file)
 3614                    log.debug(f"Update done.")
 3615
 3616        return True
 3617
 3618    def annotation_snpsift(self, threads: int = None) -> None:
 3619        """
 3620        This function annotate with bcftools
 3621
 3622        :param threads: Number of threads to use
 3623        :return: the value of the variable "return_value".
 3624        """
 3625
 3626        # DEBUG
 3627        log.debug("Start annotation with bcftools databases")
 3628
 3629        # Threads
 3630        if not threads:
 3631            threads = self.get_threads()
 3632        log.debug("Threads: " + str(threads))
 3633
 3634        # Config
 3635        config = self.get_config()
 3636        log.debug("Config: " + str(config))
 3637
 3638        # Config - snpSift
 3639        snpsift_bin_command = get_bin_command(
 3640            bin="SnpSift.jar",
 3641            tool="snpsift",
 3642            bin_type="jar",
 3643            config=config,
 3644            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3645        )
 3646        if not snpsift_bin_command:
 3647            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3648            log.error(msg_err)
 3649            raise ValueError(msg_err)
 3650
 3651        # Config - bcftools
 3652        bcftools_bin_command = get_bin_command(
 3653            bin="bcftools",
 3654            tool="bcftools",
 3655            bin_type="bin",
 3656            config=config,
 3657            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3658        )
 3659        if not bcftools_bin_command:
 3660            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3661            log.error(msg_err)
 3662            raise ValueError(msg_err)
 3663
 3664        # Config - BCFTools databases folders
 3665        databases_folders = set(
 3666            self.get_config()
 3667            .get("folders", {})
 3668            .get("databases", {})
 3669            .get("annotations", ["."])
 3670            + self.get_config()
 3671            .get("folders", {})
 3672            .get("databases", {})
 3673            .get("bcftools", ["."])
 3674        )
 3675        log.debug("Databases annotations: " + str(databases_folders))
 3676
 3677        # Param
 3678        annotations = (
 3679            self.get_param()
 3680            .get("annotation", {})
 3681            .get("snpsift", {})
 3682            .get("annotations", None)
 3683        )
 3684        log.debug("Annotations: " + str(annotations))
 3685
 3686        # Assembly
 3687        assembly = self.get_param().get(
 3688            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3689        )
 3690
 3691        # Data
 3692        table_variants = self.get_table_variants()
 3693
 3694        # Check if not empty
 3695        log.debug("Check if not empty")
 3696        sql_query_chromosomes = (
 3697            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3698        )
 3699        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3700        if not sql_query_chromosomes_df["count"][0]:
 3701            log.info(f"VCF empty")
 3702            return
 3703
 3704        # VCF header
 3705        vcf_reader = self.get_header()
 3706        log.debug("Initial header: " + str(vcf_reader.infos))
 3707
 3708        # Existing annotations
 3709        for vcf_annotation in self.get_header().infos:
 3710
 3711            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3712            log.debug(
 3713                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3714            )
 3715
 3716        if annotations:
 3717
 3718            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3719
 3720                # Export VCF file
 3721                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3722
 3723                # Init
 3724                commands = {}
 3725
 3726                for annotation in annotations:
 3727                    annotation_fields = annotations[annotation]
 3728
 3729                    # Annotation Name
 3730                    annotation_name = os.path.basename(annotation)
 3731
 3732                    if not annotation_fields:
 3733                        annotation_fields = {"INFO": None}
 3734
 3735                    log.debug(f"Annotation '{annotation_name}'")
 3736                    log.debug(
 3737                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3738                    )
 3739
 3740                    # Create Database
 3741                    database = Database(
 3742                        database=annotation,
 3743                        databases_folders=databases_folders,
 3744                        assembly=assembly,
 3745                    )
 3746
 3747                    # Find files
 3748                    db_file = database.get_database()
 3749                    db_file = full_path(db_file)
 3750                    db_hdr_file = database.get_header_file()
 3751                    db_hdr_file = full_path(db_hdr_file)
 3752                    db_file_type = database.get_format()
 3753                    db_tbi_file = f"{db_file}.tbi"
 3754                    db_file_compressed = database.is_compressed()
 3755
 3756                    # Check if compressed
 3757                    if not db_file_compressed:
 3758                        log.error(
 3759                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3760                        )
 3761                        raise ValueError(
 3762                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3763                        )
 3764
 3765                    # Check if indexed
 3766                    if not os.path.exists(db_tbi_file):
 3767                        log.error(
 3768                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3769                        )
 3770                        raise ValueError(
 3771                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3772                        )
 3773
 3774                    # Check index - try to create if not exists
 3775                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3776                        log.error("Annotation failed: database not valid")
 3777                        log.error(f"Annotation annotation file: {db_file}")
 3778                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3779                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3780                        raise ValueError(
 3781                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3782                        )
 3783                    else:
 3784
 3785                        log.debug(
 3786                            f"Annotation '{annotation}' - file: "
 3787                            + str(db_file)
 3788                            + " and "
 3789                            + str(db_hdr_file)
 3790                        )
 3791
 3792                        # Load header as VCF object
 3793                        db_hdr_vcf = Variants(input=db_hdr_file)
 3794                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3795                        log.debug(
 3796                            "Annotation database header: "
 3797                            + str(db_hdr_vcf_header_infos)
 3798                        )
 3799
 3800                        # For all fields in database
 3801                        annotation_fields_full = False
 3802                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3803                            annotation_fields = {
 3804                                key: key for key in db_hdr_vcf_header_infos
 3805                            }
 3806                            log.debug(
 3807                                "Annotation database header - All annotations added: "
 3808                                + str(annotation_fields)
 3809                            )
 3810                            annotation_fields_full = True
 3811
 3812                        # # Create file for field rename
 3813                        # log.debug("Create file for field rename")
 3814                        # tmp_rename = NamedTemporaryFile(
 3815                        #     prefix=self.get_prefix(),
 3816                        #     dir=self.get_tmp_dir(),
 3817                        #     suffix=".rename",
 3818                        #     delete=False,
 3819                        # )
 3820                        # tmp_rename_name = tmp_rename.name
 3821                        # tmp_files.append(tmp_rename_name)
 3822
 3823                        # Number of fields
 3824                        nb_annotation_field = 0
 3825                        annotation_list = []
 3826                        annotation_infos_rename_list = []
 3827
 3828                        for annotation_field in annotation_fields:
 3829
 3830                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3831                            annotation_fields_new_name = annotation_fields.get(
 3832                                annotation_field, annotation_field
 3833                            )
 3834                            if not annotation_fields_new_name:
 3835                                annotation_fields_new_name = annotation_field
 3836
 3837                            # Check if field is in DB and if field is not elready in input data
 3838                            if (
 3839                                annotation_field in db_hdr_vcf.get_header().infos
 3840                                and annotation_fields_new_name
 3841                                not in self.get_header().infos
 3842                            ):
 3843
 3844                                log.info(
 3845                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3846                                )
 3847
 3848                                # BCFTools annotate param to rename fields
 3849                                if annotation_field != annotation_fields_new_name:
 3850                                    annotation_infos_rename_list.append(
 3851                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3852                                    )
 3853
 3854                                # Add INFO field to header
 3855                                db_hdr_vcf_header_infos_number = (
 3856                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3857                                )
 3858                                db_hdr_vcf_header_infos_type = (
 3859                                    db_hdr_vcf_header_infos[annotation_field].type
 3860                                    or "String"
 3861                                )
 3862                                db_hdr_vcf_header_infos_description = (
 3863                                    db_hdr_vcf_header_infos[annotation_field].desc
 3864                                    or f"{annotation_field} description"
 3865                                )
 3866                                db_hdr_vcf_header_infos_source = (
 3867                                    db_hdr_vcf_header_infos[annotation_field].source
 3868                                    or "unknown"
 3869                                )
 3870                                db_hdr_vcf_header_infos_version = (
 3871                                    db_hdr_vcf_header_infos[annotation_field].version
 3872                                    or "unknown"
 3873                                )
 3874
 3875                                vcf_reader.infos[annotation_fields_new_name] = (
 3876                                    vcf.parser._Info(
 3877                                        annotation_fields_new_name,
 3878                                        db_hdr_vcf_header_infos_number,
 3879                                        db_hdr_vcf_header_infos_type,
 3880                                        db_hdr_vcf_header_infos_description,
 3881                                        db_hdr_vcf_header_infos_source,
 3882                                        db_hdr_vcf_header_infos_version,
 3883                                        self.code_type_map[
 3884                                            db_hdr_vcf_header_infos_type
 3885                                        ],
 3886                                    )
 3887                                )
 3888
 3889                                annotation_list.append(annotation_field)
 3890
 3891                                nb_annotation_field += 1
 3892
 3893                            else:
 3894
 3895                                if (
 3896                                    annotation_field
 3897                                    not in db_hdr_vcf.get_header().infos
 3898                                ):
 3899                                    log.warning(
 3900                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3901                                    )
 3902                                if (
 3903                                    annotation_fields_new_name
 3904                                    in self.get_header().infos
 3905                                ):
 3906                                    log.warning(
 3907                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3908                                    )
 3909
 3910                        log.info(
 3911                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3912                        )
 3913
 3914                        annotation_infos = ",".join(annotation_list)
 3915
 3916                        if annotation_infos != "":
 3917
 3918                            # Annotated VCF (and error file)
 3919                            tmp_annotation_vcf_name = os.path.join(
 3920                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3921                            )
 3922                            tmp_annotation_vcf_name_err = (
 3923                                tmp_annotation_vcf_name + ".err"
 3924                            )
 3925
 3926                            # Add fields to annotate
 3927                            if not annotation_fields_full:
 3928                                annotation_infos_option = f"-info {annotation_infos}"
 3929                            else:
 3930                                annotation_infos_option = ""
 3931
 3932                            # Info fields rename
 3933                            if annotation_infos_rename_list:
 3934                                annotation_infos_rename = " -c " + ",".join(
 3935                                    annotation_infos_rename_list
 3936                                )
 3937                            else:
 3938                                annotation_infos_rename = ""
 3939
 3940                            # Annotate command
 3941                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3942
 3943                            # Add command
 3944                            commands[command_annotate] = tmp_annotation_vcf_name
 3945
 3946                if commands:
 3947
 3948                    # Export VCF file
 3949                    self.export_variant_vcf(
 3950                        vcf_file=tmp_vcf_name,
 3951                        remove_info=True,
 3952                        add_samples=False,
 3953                        index=True,
 3954                    )
 3955                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3956
 3957                    # Num command
 3958                    nb_command = 0
 3959
 3960                    # Annotate
 3961                    for command_annotate in commands:
 3962                        nb_command += 1
 3963                        log.info(
 3964                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3965                        )
 3966                        log.debug(f"command_annotate={command_annotate}")
 3967                        run_parallel_commands([command_annotate], threads)
 3968
 3969                        # Debug
 3970                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3971
 3972                        # Update variants
 3973                        log.info(
 3974                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3975                        )
 3976                        self.update_from_vcf(commands[command_annotate])
 3977
 3978    def annotation_bcftools(self, threads: int = None) -> None:
 3979        """
 3980        This function annotate with bcftools
 3981
 3982        :param threads: Number of threads to use
 3983        :return: the value of the variable "return_value".
 3984        """
 3985
 3986        # DEBUG
 3987        log.debug("Start annotation with bcftools databases")
 3988
 3989        # Threads
 3990        if not threads:
 3991            threads = self.get_threads()
 3992        log.debug("Threads: " + str(threads))
 3993
 3994        # Config
 3995        config = self.get_config()
 3996        log.debug("Config: " + str(config))
 3997
 3998        # DEBUG
 3999        delete_tmp = True
 4000        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4001            delete_tmp = False
 4002            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4003
 4004        # Config - BCFTools bin command
 4005        bcftools_bin_command = get_bin_command(
 4006            bin="bcftools",
 4007            tool="bcftools",
 4008            bin_type="bin",
 4009            config=config,
 4010            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 4011        )
 4012        if not bcftools_bin_command:
 4013            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 4014            log.error(msg_err)
 4015            raise ValueError(msg_err)
 4016
 4017        # Config - BCFTools databases folders
 4018        databases_folders = set(
 4019            self.get_config()
 4020            .get("folders", {})
 4021            .get("databases", {})
 4022            .get("annotations", ["."])
 4023            + self.get_config()
 4024            .get("folders", {})
 4025            .get("databases", {})
 4026            .get("bcftools", ["."])
 4027        )
 4028        log.debug("Databases annotations: " + str(databases_folders))
 4029
 4030        # Param
 4031        annotations = (
 4032            self.get_param()
 4033            .get("annotation", {})
 4034            .get("bcftools", {})
 4035            .get("annotations", None)
 4036        )
 4037        log.debug("Annotations: " + str(annotations))
 4038
 4039        # Assembly
 4040        assembly = self.get_param().get(
 4041            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4042        )
 4043
 4044        # Data
 4045        table_variants = self.get_table_variants()
 4046
 4047        # Check if not empty
 4048        log.debug("Check if not empty")
 4049        sql_query_chromosomes = (
 4050            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4051        )
 4052        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4053        if not sql_query_chromosomes_df["count"][0]:
 4054            log.info(f"VCF empty")
 4055            return
 4056
 4057        # Export in VCF
 4058        log.debug("Create initial file to annotate")
 4059        tmp_vcf = NamedTemporaryFile(
 4060            prefix=self.get_prefix(),
 4061            dir=self.get_tmp_dir(),
 4062            suffix=".vcf.gz",
 4063            delete=False,
 4064        )
 4065        tmp_vcf_name = tmp_vcf.name
 4066
 4067        # VCF header
 4068        vcf_reader = self.get_header()
 4069        log.debug("Initial header: " + str(vcf_reader.infos))
 4070
 4071        # Existing annotations
 4072        for vcf_annotation in self.get_header().infos:
 4073
 4074            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4075            log.debug(
 4076                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4077            )
 4078
 4079        if annotations:
 4080
 4081            tmp_ann_vcf_list = []
 4082            commands = []
 4083            tmp_files = []
 4084            err_files = []
 4085
 4086            for annotation in annotations:
 4087                annotation_fields = annotations[annotation]
 4088
 4089                # Annotation Name
 4090                annotation_name = os.path.basename(annotation)
 4091
 4092                if not annotation_fields:
 4093                    annotation_fields = {"INFO": None}
 4094
 4095                log.debug(f"Annotation '{annotation_name}'")
 4096                log.debug(
 4097                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4098                )
 4099
 4100                # Create Database
 4101                database = Database(
 4102                    database=annotation,
 4103                    databases_folders=databases_folders,
 4104                    assembly=assembly,
 4105                )
 4106
 4107                # Find files
 4108                db_file = database.get_database()
 4109                db_file = full_path(db_file)
 4110                db_hdr_file = database.get_header_file()
 4111                db_hdr_file = full_path(db_hdr_file)
 4112                db_file_type = database.get_format()
 4113                db_tbi_file = f"{db_file}.tbi"
 4114                db_file_compressed = database.is_compressed()
 4115
 4116                # Check if compressed
 4117                if not db_file_compressed:
 4118                    log.error(
 4119                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4120                    )
 4121                    raise ValueError(
 4122                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4123                    )
 4124
 4125                # Check if indexed
 4126                if not os.path.exists(db_tbi_file):
 4127                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4128                    raise ValueError(
 4129                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4130                    )
 4131
 4132                # Check index - try to create if not exists
 4133                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4134                    log.error("Annotation failed: database not valid")
 4135                    log.error(f"Annotation annotation file: {db_file}")
 4136                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4137                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4138                    raise ValueError(
 4139                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4140                    )
 4141                else:
 4142
 4143                    log.debug(
 4144                        f"Annotation '{annotation}' - file: "
 4145                        + str(db_file)
 4146                        + " and "
 4147                        + str(db_hdr_file)
 4148                    )
 4149
 4150                    # Load header as VCF object
 4151                    db_hdr_vcf = Variants(input=db_hdr_file)
 4152                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4153                    log.debug(
 4154                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4155                    )
 4156
 4157                    # For all fields in database
 4158                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4159                        annotation_fields = {
 4160                            key: key for key in db_hdr_vcf_header_infos
 4161                        }
 4162                        log.debug(
 4163                            "Annotation database header - All annotations added: "
 4164                            + str(annotation_fields)
 4165                        )
 4166
 4167                    # Number of fields
 4168                    nb_annotation_field = 0
 4169                    annotation_list = []
 4170
 4171                    for annotation_field in annotation_fields:
 4172
 4173                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4174                        annotation_fields_new_name = annotation_fields.get(
 4175                            annotation_field, annotation_field
 4176                        )
 4177                        if not annotation_fields_new_name:
 4178                            annotation_fields_new_name = annotation_field
 4179
 4180                        # Check if field is in DB and if field is not elready in input data
 4181                        if (
 4182                            annotation_field in db_hdr_vcf.get_header().infos
 4183                            and annotation_fields_new_name
 4184                            not in self.get_header().infos
 4185                        ):
 4186
 4187                            log.info(
 4188                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4189                            )
 4190
 4191                            # Add INFO field to header
 4192                            db_hdr_vcf_header_infos_number = (
 4193                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4194                            )
 4195                            db_hdr_vcf_header_infos_type = (
 4196                                db_hdr_vcf_header_infos[annotation_field].type
 4197                                or "String"
 4198                            )
 4199                            db_hdr_vcf_header_infos_description = (
 4200                                db_hdr_vcf_header_infos[annotation_field].desc
 4201                                or f"{annotation_field} description"
 4202                            )
 4203                            db_hdr_vcf_header_infos_source = (
 4204                                db_hdr_vcf_header_infos[annotation_field].source
 4205                                or "unknown"
 4206                            )
 4207                            db_hdr_vcf_header_infos_version = (
 4208                                db_hdr_vcf_header_infos[annotation_field].version
 4209                                or "unknown"
 4210                            )
 4211
 4212                            vcf_reader.infos[annotation_fields_new_name] = (
 4213                                vcf.parser._Info(
 4214                                    annotation_fields_new_name,
 4215                                    db_hdr_vcf_header_infos_number,
 4216                                    db_hdr_vcf_header_infos_type,
 4217                                    db_hdr_vcf_header_infos_description,
 4218                                    db_hdr_vcf_header_infos_source,
 4219                                    db_hdr_vcf_header_infos_version,
 4220                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4221                                )
 4222                            )
 4223
 4224                            # annotation_list.append(annotation_field)
 4225                            if annotation_field != annotation_fields_new_name:
 4226                                annotation_list.append(
 4227                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4228                                )
 4229                            else:
 4230                                annotation_list.append(annotation_field)
 4231
 4232                            nb_annotation_field += 1
 4233
 4234                        else:
 4235
 4236                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4237                                log.warning(
 4238                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4239                                )
 4240                            if annotation_fields_new_name in self.get_header().infos:
 4241                                log.warning(
 4242                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4243                                )
 4244
 4245                    log.info(
 4246                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4247                    )
 4248
 4249                    annotation_infos = ",".join(annotation_list)
 4250
 4251                    if annotation_infos != "":
 4252
 4253                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4254                        log.debug("Protect Header file - remove #CHROM line if exists")
 4255                        tmp_header_vcf = NamedTemporaryFile(
 4256                            prefix=self.get_prefix(),
 4257                            dir=self.get_tmp_dir(),
 4258                            suffix=".hdr",
 4259                            delete=False,
 4260                        )
 4261                        tmp_header_vcf_name = tmp_header_vcf.name
 4262                        tmp_files.append(tmp_header_vcf_name)
 4263                        # Command
 4264                        if db_hdr_file.endswith(".gz"):
 4265                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4266                        else:
 4267                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4268                        # Run
 4269                        run_parallel_commands([command_extract_header], 1)
 4270
 4271                        # Find chromosomes
 4272                        log.debug("Find chromosomes ")
 4273                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4274                        sql_query_chromosomes_df = self.get_query_to_df(
 4275                            sql_query_chromosomes
 4276                        )
 4277                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4278
 4279                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4280
 4281                        # BED columns in the annotation file
 4282                        if db_file_type in ["bed"]:
 4283                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4284
 4285                        for chrom in chomosomes_list:
 4286
 4287                            # Create BED on initial VCF
 4288                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4289                            tmp_bed = NamedTemporaryFile(
 4290                                prefix=self.get_prefix(),
 4291                                dir=self.get_tmp_dir(),
 4292                                suffix=".bed",
 4293                                delete=False,
 4294                            )
 4295                            tmp_bed_name = tmp_bed.name
 4296                            tmp_files.append(tmp_bed_name)
 4297
 4298                            # Detect regions
 4299                            log.debug(
 4300                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4301                            )
 4302                            window = 1000000
 4303                            sql_query_intervals_for_bed = f"""
 4304                                SELECT  \"#CHROM\",
 4305                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4306                                        \"POS\"+{window}
 4307                                FROM {table_variants} as table_variants
 4308                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4309                            """
 4310                            regions = self.conn.execute(
 4311                                sql_query_intervals_for_bed
 4312                            ).fetchall()
 4313                            merged_regions = merge_regions(regions)
 4314                            log.debug(
 4315                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4316                            )
 4317
 4318                            header = ["#CHROM", "START", "END"]
 4319                            with open(tmp_bed_name, "w") as f:
 4320                                # Write the header with tab delimiter
 4321                                f.write("\t".join(header) + "\n")
 4322                                for d in merged_regions:
 4323                                    # Write each data row with tab delimiter
 4324                                    f.write("\t".join(map(str, d)) + "\n")
 4325
 4326                            # Tmp files
 4327                            tmp_annotation_vcf = NamedTemporaryFile(
 4328                                prefix=self.get_prefix(),
 4329                                dir=self.get_tmp_dir(),
 4330                                suffix=".vcf.gz",
 4331                                delete=False,
 4332                            )
 4333                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4334                            tmp_files.append(tmp_annotation_vcf_name)
 4335                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4336                            tmp_annotation_vcf_name_err = (
 4337                                tmp_annotation_vcf_name + ".err"
 4338                            )
 4339                            err_files.append(tmp_annotation_vcf_name_err)
 4340
 4341                            # Annotate Command
 4342                            log.debug(
 4343                                f"Annotation '{annotation}' - add bcftools command"
 4344                            )
 4345
 4346                            # Command
 4347                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4348
 4349                            # Add command
 4350                            commands.append(command_annotate)
 4351
 4352            # if some commands
 4353            if commands:
 4354
 4355                # Export VCF file
 4356                self.export_variant_vcf(
 4357                    vcf_file=tmp_vcf_name,
 4358                    remove_info=True,
 4359                    add_samples=False,
 4360                    index=True,
 4361                )
 4362
 4363                # Threads
 4364                # calculate threads for annotated commands
 4365                if commands:
 4366                    threads_bcftools_annotate = round(threads / len(commands))
 4367                else:
 4368                    threads_bcftools_annotate = 1
 4369
 4370                if not threads_bcftools_annotate:
 4371                    threads_bcftools_annotate = 1
 4372
 4373                # Add threads option to bcftools commands
 4374                if threads_bcftools_annotate > 1:
 4375                    commands_threaded = []
 4376                    for command in commands:
 4377                        commands_threaded.append(
 4378                            command.replace(
 4379                                f"{bcftools_bin_command} annotate ",
 4380                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4381                            )
 4382                        )
 4383                    commands = commands_threaded
 4384
 4385                # Command annotation multithreading
 4386                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4387                log.info(
 4388                    f"Annotation - Annotation multithreaded in "
 4389                    + str(len(commands))
 4390                    + " commands"
 4391                )
 4392
 4393                run_parallel_commands(commands, threads)
 4394
 4395                # Merge
 4396                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4397
 4398                if tmp_ann_vcf_list_cmd:
 4399
 4400                    # Tmp file
 4401                    tmp_annotate_vcf = NamedTemporaryFile(
 4402                        prefix=self.get_prefix(),
 4403                        dir=self.get_tmp_dir(),
 4404                        suffix=".vcf.gz",
 4405                        delete=True,
 4406                    )
 4407                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4408                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4409                    err_files.append(tmp_annotate_vcf_name_err)
 4410
 4411                    # Tmp file remove command
 4412                    tmp_files_remove_command = ""
 4413                    if tmp_files:
 4414                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4415
 4416                    # Command merge
 4417                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4418                    log.info(
 4419                        f"Annotation - Annotation merging "
 4420                        + str(len(commands))
 4421                        + " annotated files"
 4422                    )
 4423                    log.debug(f"Annotation - merge command: {merge_command}")
 4424                    run_parallel_commands([merge_command], 1)
 4425
 4426                    # Error messages
 4427                    log.info(f"Error/Warning messages:")
 4428                    error_message_command_all = []
 4429                    error_message_command_warning = []
 4430                    error_message_command_err = []
 4431                    for err_file in err_files:
 4432                        with open(err_file, "r") as f:
 4433                            for line in f:
 4434                                message = line.strip()
 4435                                error_message_command_all.append(message)
 4436                                if line.startswith("[W::"):
 4437                                    error_message_command_warning.append(message)
 4438                                if line.startswith("[E::"):
 4439                                    error_message_command_err.append(
 4440                                        f"{err_file}: " + message
 4441                                    )
 4442                    # log info
 4443                    for message in list(
 4444                        set(error_message_command_err + error_message_command_warning)
 4445                    ):
 4446                        log.info(f"   {message}")
 4447                    # debug info
 4448                    for message in list(set(error_message_command_all)):
 4449                        log.debug(f"   {message}")
 4450                    # failed
 4451                    if len(error_message_command_err):
 4452                        log.error("Annotation failed: Error in commands")
 4453                        raise ValueError("Annotation failed: Error in commands")
 4454
 4455                    # Update variants
 4456                    log.info(f"Annotation - Updating...")
 4457                    self.update_from_vcf(tmp_annotate_vcf_name)
 4458
 4459    def annotation_exomiser(self, threads: int = None) -> None:
 4460        """
 4461        This function annotate with Exomiser
 4462
 4463        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4464        - "analysis" (dict/file):
 4465            Full analysis dictionary parameters (see Exomiser docs).
 4466            Either a dict, or a file in JSON or YAML format.
 4467            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 4468            Default : None
 4469        - "preset" (string):
 4470            Analysis preset (available in config folder).
 4471            Used if no full "analysis" is provided.
 4472            Default: "exome"
 4473        - "phenopacket" (dict/file):
 4474            Samples and phenotypic features parameters (see Exomiser docs).
 4475            Either a dict, or a file in JSON or YAML format.
 4476            Default: None
 4477        - "subject" (dict):
 4478            Sample parameters (see Exomiser docs).
 4479            Example:
 4480                "subject":
 4481                    {
 4482                        "id": "ISDBM322017",
 4483                        "sex": "FEMALE"
 4484                    }
 4485            Default: None
 4486        - "sample" (string):
 4487            Sample name to construct "subject" section:
 4488                "subject":
 4489                    {
 4490                        "id": "<sample>",
 4491                        "sex": "UNKNOWN_SEX"
 4492                    }
 4493            Default: None
 4494        - "phenotypicFeatures" (dict)
 4495            Phenotypic features to construct "subject" section.
 4496            Example:
 4497                "phenotypicFeatures":
 4498                    [
 4499                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4500                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4501                    ]
 4502        - "hpo" (list)
 4503            List of HPO ids as phenotypic features.
 4504            Example:
 4505                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4506            Default: []
 4507        - "outputOptions" (dict):
 4508            Output options (see Exomiser docs).
 4509            Default:
 4510                "output_options" =
 4511                    {
 4512                        "outputContributingVariantsOnly": False,
 4513                        "numGenes": 0,
 4514                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4515                    }
 4516        - "transcript_source" (string):
 4517            Transcript source (either "refseq", "ucsc", "ensembl")
 4518            Default: "refseq"
 4519        - "exomiser_to_info" (boolean):
 4520            Add exomiser TSV file columns as INFO fields in VCF.
 4521            Default: False
 4522        - "release" (string):
 4523            Exomiser database release.
 4524            If not exists, database release will be downloaded (take a while).
 4525            Default: None (provided by application.properties configuration file)
 4526        - "exomiser_application_properties" (file):
 4527            Exomiser configuration file (see Exomiser docs).
 4528            Useful to automatically download databases (especially for specific genome databases).
 4529
 4530        Notes:
 4531        - If no sample in parameters, first sample in VCF will be chosen
 4532        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4533
 4534        :param threads: The number of threads to use
 4535        :return: None.
 4536        """
 4537
 4538        # DEBUG
 4539        log.debug("Start annotation with Exomiser databases")
 4540
 4541        # Threads
 4542        if not threads:
 4543            threads = self.get_threads()
 4544        log.debug("Threads: " + str(threads))
 4545
 4546        # Config
 4547        config = self.get_config()
 4548        log.debug("Config: " + str(config))
 4549
 4550        # Config - Folders - Databases
 4551        databases_folders = (
 4552            config.get("folders", {})
 4553            .get("databases", {})
 4554            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4555        )
 4556        databases_folders = full_path(databases_folders)
 4557        if not os.path.exists(databases_folders):
 4558            log.error(f"Databases annotations: {databases_folders} NOT found")
 4559        log.debug("Databases annotations: " + str(databases_folders))
 4560
 4561        # Config - Exomiser
 4562        exomiser_bin_command = get_bin_command(
 4563            bin="exomiser-cli*.jar",
 4564            tool="exomiser",
 4565            bin_type="jar",
 4566            config=config,
 4567            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4568        )
 4569        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4570        if not exomiser_bin_command:
 4571            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4572            log.error(msg_err)
 4573            raise ValueError(msg_err)
 4574
 4575        # Param
 4576        param = self.get_param()
 4577        log.debug("Param: " + str(param))
 4578
 4579        # Param - Exomiser
 4580        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4581        log.debug(f"Param Exomiser: {param_exomiser}")
 4582
 4583        # Param - Assembly
 4584        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4585        log.debug("Assembly: " + str(assembly))
 4586
 4587        # Data
 4588        table_variants = self.get_table_variants()
 4589
 4590        # Check if not empty
 4591        log.debug("Check if not empty")
 4592        sql_query_chromosomes = (
 4593            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4594        )
 4595        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4596            log.info(f"VCF empty")
 4597            return False
 4598
 4599        # VCF header
 4600        vcf_reader = self.get_header()
 4601        log.debug("Initial header: " + str(vcf_reader.infos))
 4602
 4603        # Samples
 4604        samples = self.get_header_sample_list()
 4605        if not samples:
 4606            log.error("No Samples in VCF")
 4607            return False
 4608        log.debug(f"Samples: {samples}")
 4609
 4610        # Memory limit
 4611        memory_limit = self.get_memory("8G")
 4612        log.debug(f"memory_limit: {memory_limit}")
 4613
 4614        # Exomiser java options
 4615        exomiser_java_options = (
 4616            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4617        )
 4618        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4619
 4620        # Download Exomiser (if not exists)
 4621        exomiser_release = param_exomiser.get("release", None)
 4622        exomiser_application_properties = param_exomiser.get(
 4623            "exomiser_application_properties", None
 4624        )
 4625        databases_download_exomiser(
 4626            assemblies=[assembly],
 4627            exomiser_folder=databases_folders,
 4628            exomiser_release=exomiser_release,
 4629            exomiser_phenotype_release=exomiser_release,
 4630            exomiser_application_properties=exomiser_application_properties,
 4631        )
 4632
 4633        # Force annotation
 4634        force_update_annotation = True
 4635
 4636        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4637            log.debug("Start annotation Exomiser")
 4638
 4639            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4640
 4641                # tmp_dir = "/tmp/exomiser"
 4642
 4643                ### ANALYSIS ###
 4644                ################
 4645
 4646                # Create analysis.json through analysis dict
 4647                # either analysis in param or by default
 4648                # depending on preset exome/genome)
 4649
 4650                # Init analysis dict
 4651                param_exomiser_analysis_dict = {}
 4652
 4653                # analysis from param
 4654                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4655                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4656
 4657                # If analysis in param -> load analysis json
 4658                if param_exomiser_analysis:
 4659
 4660                    # If param analysis is a file and exists
 4661                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4662                        param_exomiser_analysis
 4663                    ):
 4664                        # Load analysis file into analysis dict (either yaml or json)
 4665                        with open(param_exomiser_analysis) as json_file:
 4666                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4667
 4668                    # If param analysis is a dict
 4669                    elif isinstance(param_exomiser_analysis, dict):
 4670                        # Load analysis dict into analysis dict (either yaml or json)
 4671                        param_exomiser_analysis_dict = param_exomiser_analysis
 4672
 4673                    # Error analysis type
 4674                    else:
 4675                        log.error(f"Analysis type unknown. Check param file.")
 4676                        raise ValueError(f"Analysis type unknown. Check param file.")
 4677
 4678                # Case no input analysis config file/dict
 4679                # Use preset (exome/genome) to open default config file
 4680                if not param_exomiser_analysis_dict:
 4681
 4682                    # default preset
 4683                    default_preset = "exome"
 4684
 4685                    # Get param preset or default preset
 4686                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4687
 4688                    # Try to find if preset is a file
 4689                    if os.path.exists(param_exomiser_preset):
 4690                        # Preset file is provided in full path
 4691                        param_exomiser_analysis_default_config_file = (
 4692                            param_exomiser_preset
 4693                        )
 4694                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4695                    #     # Preset file is provided in full path
 4696                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4697                    elif os.path.exists(
 4698                        os.path.join(folder_config, param_exomiser_preset)
 4699                    ):
 4700                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
 4701                        param_exomiser_analysis_default_config_file = os.path.join(
 4702                            folder_config, param_exomiser_preset
 4703                        )
 4704                    else:
 4705                        # Construct preset file
 4706                        param_exomiser_analysis_default_config_file = os.path.join(
 4707                            folder_config,
 4708                            f"preset-{param_exomiser_preset}-analysis.json",
 4709                        )
 4710
 4711                    # If preset file exists
 4712                    param_exomiser_analysis_default_config_file = full_path(
 4713                        param_exomiser_analysis_default_config_file
 4714                    )
 4715                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4716                        # Load preset file into analysis dict (either yaml or json)
 4717                        with open(
 4718                            param_exomiser_analysis_default_config_file
 4719                        ) as json_file:
 4720                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4721                                json_file
 4722                            )
 4723
 4724                    # Error preset file
 4725                    else:
 4726                        log.error(
 4727                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4728                        )
 4729                        raise ValueError(
 4730                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4731                        )
 4732
 4733                # If no analysis dict created
 4734                if not param_exomiser_analysis_dict:
 4735                    log.error(f"No analysis config")
 4736                    raise ValueError(f"No analysis config")
 4737
 4738                # Log
 4739                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4740
 4741                ### PHENOPACKET ###
 4742                ###################
 4743
 4744                # If no PhenoPacket in analysis dict -> check in param
 4745                if "phenopacket" not in param_exomiser_analysis_dict:
 4746
 4747                    # If PhenoPacket in param -> load phenopacket json
 4748                    if param_exomiser.get("phenopacket", None):
 4749
 4750                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4751                        param_exomiser_phenopacket = full_path(
 4752                            param_exomiser_phenopacket
 4753                        )
 4754
 4755                        # If param phenopacket is a file and exists
 4756                        if isinstance(
 4757                            param_exomiser_phenopacket, str
 4758                        ) and os.path.exists(param_exomiser_phenopacket):
 4759                            # Load phenopacket file into analysis dict (either yaml or json)
 4760                            with open(param_exomiser_phenopacket) as json_file:
 4761                                param_exomiser_analysis_dict["phenopacket"] = (
 4762                                    yaml.safe_load(json_file)
 4763                                )
 4764
 4765                        # If param phenopacket is a dict
 4766                        elif isinstance(param_exomiser_phenopacket, dict):
 4767                            # Load phenopacket dict into analysis dict (either yaml or json)
 4768                            param_exomiser_analysis_dict["phenopacket"] = (
 4769                                param_exomiser_phenopacket
 4770                            )
 4771
 4772                        # Error phenopacket type
 4773                        else:
 4774                            log.error(f"Phenopacket type unknown. Check param file.")
 4775                            raise ValueError(
 4776                                f"Phenopacket type unknown. Check param file."
 4777                            )
 4778
 4779                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4780                if "phenopacket" not in param_exomiser_analysis_dict:
 4781
 4782                    # Init PhenoPacket
 4783                    param_exomiser_analysis_dict["phenopacket"] = {
 4784                        "id": "analysis",
 4785                        "proband": {},
 4786                    }
 4787
 4788                    ### Add subject ###
 4789
 4790                    # If subject exists
 4791                    param_exomiser_subject = param_exomiser.get("subject", {})
 4792
 4793                    # If subject not exists -> found sample ID
 4794                    if not param_exomiser_subject:
 4795
 4796                        # Found sample ID in param
 4797                        sample = param_exomiser.get("sample", None)
 4798
 4799                        # Find sample ID (first sample)
 4800                        if not sample:
 4801                            sample_list = self.get_header_sample_list()
 4802                            if len(sample_list) > 0:
 4803                                sample = sample_list[0]
 4804                            else:
 4805                                log.error(f"No sample found")
 4806                                raise ValueError(f"No sample found")
 4807
 4808                        # Create subject
 4809                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4810
 4811                    # Add to dict
 4812                    param_exomiser_analysis_dict["phenopacket"][
 4813                        "subject"
 4814                    ] = param_exomiser_subject
 4815
 4816                    ### Add "phenotypicFeatures" ###
 4817
 4818                    # If phenotypicFeatures exists
 4819                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4820                        "phenotypicFeatures", []
 4821                    )
 4822
 4823                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4824                    if not param_exomiser_phenotypicfeatures:
 4825
 4826                        # Found HPO in param
 4827                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4828
 4829                        # Split HPO if list in string format separated by comma
 4830                        if isinstance(param_exomiser_hpo, str):
 4831                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4832
 4833                        # Create HPO list
 4834                        for hpo in param_exomiser_hpo:
 4835                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4836                            param_exomiser_phenotypicfeatures.append(
 4837                                {
 4838                                    "type": {
 4839                                        "id": f"HP:{hpo_clean}",
 4840                                        "label": f"HP:{hpo_clean}",
 4841                                    }
 4842                                }
 4843                            )
 4844
 4845                    # Add to dict
 4846                    param_exomiser_analysis_dict["phenopacket"][
 4847                        "phenotypicFeatures"
 4848                    ] = param_exomiser_phenotypicfeatures
 4849
 4850                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4851                    if not param_exomiser_phenotypicfeatures:
 4852                        for step in param_exomiser_analysis_dict.get(
 4853                            "analysis", {}
 4854                        ).get("steps", []):
 4855                            if "hiPhivePrioritiser" in step:
 4856                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4857                                    "steps", []
 4858                                ).remove(step)
 4859
 4860                ### Add Input File ###
 4861
 4862                # Initial file name and htsFiles
 4863                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4864                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4865                    {
 4866                        "uri": tmp_vcf_name,
 4867                        "htsFormat": "VCF",
 4868                        "genomeAssembly": assembly,
 4869                    }
 4870                ]
 4871
 4872                ### Add metaData ###
 4873
 4874                # If metaData not in analysis dict
 4875                if "metaData" not in param_exomiser_analysis_dict:
 4876                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4877                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4878                        "createdBy": "howard",
 4879                        "phenopacketSchemaVersion": 1,
 4880                    }
 4881
 4882                ### OutputOptions ###
 4883
 4884                # Init output result folder
 4885                output_results = os.path.join(tmp_dir, "results")
 4886
 4887                # If no outputOptions in analysis dict
 4888                if "outputOptions" not in param_exomiser_analysis_dict:
 4889
 4890                    # default output formats
 4891                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4892
 4893                    # Get outputOptions in param
 4894                    output_options = param_exomiser.get("outputOptions", None)
 4895
 4896                    # If no output_options in param -> check
 4897                    if not output_options:
 4898                        output_options = {
 4899                            "outputContributingVariantsOnly": False,
 4900                            "numGenes": 0,
 4901                            "outputFormats": defaut_output_formats,
 4902                        }
 4903
 4904                    # Replace outputDirectory in output options
 4905                    output_options["outputDirectory"] = output_results
 4906                    output_options["outputFileName"] = "howard"
 4907
 4908                    # Add outputOptions in analysis dict
 4909                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4910
 4911                else:
 4912
 4913                    # Replace output_results and output format (if exists in param)
 4914                    param_exomiser_analysis_dict["outputOptions"][
 4915                        "outputDirectory"
 4916                    ] = output_results
 4917                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4918                        list(
 4919                            set(
 4920                                param_exomiser_analysis_dict.get(
 4921                                    "outputOptions", {}
 4922                                ).get("outputFormats", [])
 4923                                + ["TSV_VARIANT", "VCF"]
 4924                            )
 4925                        )
 4926                    )
 4927
 4928                # log
 4929                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4930
 4931                ### ANALYSIS FILE ###
 4932                #####################
 4933
 4934                ### Full JSON analysis config file ###
 4935
 4936                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4937                with open(exomiser_analysis, "w") as fp:
 4938                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4939
 4940                ### SPLIT analysis and sample config files
 4941
 4942                # Splitted analysis dict
 4943                param_exomiser_analysis_dict_for_split = (
 4944                    param_exomiser_analysis_dict.copy()
 4945                )
 4946
 4947                # Phenopacket JSON file
 4948                exomiser_analysis_phenopacket = os.path.join(
 4949                    tmp_dir, "analysis_phenopacket.json"
 4950                )
 4951                with open(exomiser_analysis_phenopacket, "w") as fp:
 4952                    json.dump(
 4953                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4954                        fp,
 4955                        indent=4,
 4956                    )
 4957
 4958                # Analysis JSON file without Phenopacket parameters
 4959                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4960                exomiser_analysis_analysis = os.path.join(
 4961                    tmp_dir, "analysis_analysis.json"
 4962                )
 4963                with open(exomiser_analysis_analysis, "w") as fp:
 4964                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4965
 4966                ### INITAL VCF file ###
 4967                #######################
 4968
 4969                ### Create list of samples to use and include inti initial VCF file ####
 4970
 4971                # Subject (main sample)
 4972                # Get sample ID in analysis dict
 4973                sample_subject = (
 4974                    param_exomiser_analysis_dict.get("phenopacket", {})
 4975                    .get("subject", {})
 4976                    .get("id", None)
 4977                )
 4978                sample_proband = (
 4979                    param_exomiser_analysis_dict.get("phenopacket", {})
 4980                    .get("proband", {})
 4981                    .get("subject", {})
 4982                    .get("id", None)
 4983                )
 4984                sample = []
 4985                if sample_subject:
 4986                    sample.append(sample_subject)
 4987                if sample_proband:
 4988                    sample.append(sample_proband)
 4989
 4990                # Get sample ID within Pedigree
 4991                pedigree_persons_list = (
 4992                    param_exomiser_analysis_dict.get("phenopacket", {})
 4993                    .get("pedigree", {})
 4994                    .get("persons", {})
 4995                )
 4996
 4997                # Create list with all sample ID in pedigree (if exists)
 4998                pedigree_persons = []
 4999                for person in pedigree_persons_list:
 5000                    pedigree_persons.append(person.get("individualId"))
 5001
 5002                # Concat subject sample ID and samples ID in pedigreesamples
 5003                samples = list(set(sample + pedigree_persons))
 5004
 5005                # Check if sample list is not empty
 5006                if not samples:
 5007                    log.error(f"No samples found")
 5008                    raise ValueError(f"No samples found")
 5009
 5010                # Create VCF with sample (either sample in param or first one by default)
 5011                # Export VCF file
 5012                self.export_variant_vcf(
 5013                    vcf_file=tmp_vcf_name,
 5014                    remove_info=True,
 5015                    add_samples=True,
 5016                    list_samples=samples,
 5017                    index=False,
 5018                )
 5019
 5020                ### Execute Exomiser ###
 5021                ########################
 5022
 5023                # Init command
 5024                exomiser_command = ""
 5025
 5026                # Command exomiser options
 5027                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5028
 5029                # Release
 5030                exomiser_release = param_exomiser.get("release", None)
 5031                if exomiser_release:
 5032                    # phenotype data version
 5033                    exomiser_options += (
 5034                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5035                    )
 5036                    # data version
 5037                    exomiser_options += (
 5038                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5039                    )
 5040                    # variant white list
 5041                    variant_white_list_file = (
 5042                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5043                    )
 5044                    if os.path.exists(
 5045                        os.path.join(
 5046                            databases_folders, assembly, variant_white_list_file
 5047                        )
 5048                    ):
 5049                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5050
 5051                # transcript_source
 5052                transcript_source = param_exomiser.get(
 5053                    "transcript_source", None
 5054                )  # ucsc, refseq, ensembl
 5055                if transcript_source:
 5056                    exomiser_options += (
 5057                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5058                    )
 5059
 5060                # If analysis contain proband param
 5061                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5062                    "proband", {}
 5063                ):
 5064                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5065
 5066                # If no proband (usually uniq sample)
 5067                else:
 5068                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5069
 5070                # Log
 5071                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5072
 5073                # Run command
 5074                result = subprocess.call(
 5075                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5076                )
 5077                if result:
 5078                    log.error("Exomiser command failed")
 5079                    raise ValueError("Exomiser command failed")
 5080
 5081                ### RESULTS ###
 5082                ###############
 5083
 5084                ### Annotate with TSV fields ###
 5085
 5086                # Init result tsv file
 5087                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5088
 5089                # Init result tsv file
 5090                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5091
 5092                # Parse TSV file and explode columns in INFO field
 5093                if exomiser_to_info and os.path.exists(output_results_tsv):
 5094
 5095                    # Log
 5096                    log.debug("Exomiser columns to VCF INFO field")
 5097
 5098                    # Retrieve columns and types
 5099                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5100                    output_results_tsv_df = self.get_query_to_df(query)
 5101                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5102
 5103                    # Init concat fields for update
 5104                    sql_query_update_concat_fields = []
 5105
 5106                    # Fields to avoid
 5107                    fields_to_avoid = [
 5108                        "CONTIG",
 5109                        "START",
 5110                        "END",
 5111                        "REF",
 5112                        "ALT",
 5113                        "QUAL",
 5114                        "FILTER",
 5115                        "GENOTYPE",
 5116                    ]
 5117
 5118                    # List all columns to add into header
 5119                    for header_column in output_results_tsv_columns:
 5120
 5121                        # If header column is enable
 5122                        if header_column not in fields_to_avoid:
 5123
 5124                            # Header info type
 5125                            header_info_type = "String"
 5126                            header_column_df = output_results_tsv_df[header_column]
 5127                            header_column_df_dtype = header_column_df.dtype
 5128                            if header_column_df_dtype == object:
 5129                                if (
 5130                                    pd.to_numeric(header_column_df, errors="coerce")
 5131                                    .notnull()
 5132                                    .all()
 5133                                ):
 5134                                    header_info_type = "Float"
 5135                            else:
 5136                                header_info_type = "Integer"
 5137
 5138                            # Header info
 5139                            characters_to_validate = ["-"]
 5140                            pattern = "[" + "".join(characters_to_validate) + "]"
 5141                            header_info_name = re.sub(
 5142                                pattern,
 5143                                "_",
 5144                                f"Exomiser_{header_column}".replace("#", ""),
 5145                            )
 5146                            header_info_number = "."
 5147                            header_info_description = (
 5148                                f"Exomiser {header_column} annotation"
 5149                            )
 5150                            header_info_source = "Exomiser"
 5151                            header_info_version = "unknown"
 5152                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5153                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5154                                header_info_name,
 5155                                header_info_number,
 5156                                header_info_type,
 5157                                header_info_description,
 5158                                header_info_source,
 5159                                header_info_version,
 5160                                header_info_code,
 5161                            )
 5162
 5163                            # Add field to add for update to concat fields
 5164                            sql_query_update_concat_fields.append(
 5165                                f"""
 5166                                CASE
 5167                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5168                                    THEN concat(
 5169                                        '{header_info_name}=',
 5170                                        table_parquet."{header_column}",
 5171                                        ';'
 5172                                        )
 5173
 5174                                    ELSE ''
 5175                                END
 5176                            """
 5177                            )
 5178
 5179                    # Update query
 5180                    sql_query_update = f"""
 5181                        UPDATE {table_variants} as table_variants
 5182                            SET INFO = concat(
 5183                                            CASE
 5184                                                WHEN INFO NOT IN ('', '.')
 5185                                                THEN INFO
 5186                                                ELSE ''
 5187                                            END,
 5188                                            CASE
 5189                                                WHEN table_variants.INFO NOT IN ('','.')
 5190                                                THEN ';'
 5191                                                ELSE ''
 5192                                            END,
 5193                                            (
 5194                                            SELECT 
 5195                                                concat(
 5196                                                    {",".join(sql_query_update_concat_fields)}
 5197                                                )
 5198                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5199                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5200                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5201                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5202                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5203                                            )
 5204                                        )
 5205                            ;
 5206                        """
 5207
 5208                    # Update
 5209                    self.conn.execute(sql_query_update)
 5210
 5211                ### Annotate with VCF INFO field ###
 5212
 5213                # Init result VCF file
 5214                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5215
 5216                # If VCF exists
 5217                if os.path.exists(output_results_vcf):
 5218
 5219                    # Log
 5220                    log.debug("Exomiser result VCF update variants")
 5221
 5222                    # Find Exomiser INFO field annotation in header
 5223                    with gzip.open(output_results_vcf, "rt") as f:
 5224                        header_list = self.read_vcf_header(f)
 5225                    exomiser_vcf_header = vcf.Reader(
 5226                        io.StringIO("\n".join(header_list))
 5227                    )
 5228
 5229                    # Add annotation INFO field to header
 5230                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5231
 5232                    # Update variants with VCF
 5233                    self.update_from_vcf(output_results_vcf)
 5234
 5235        return True
 5236
 5237    def annotation_snpeff(self, threads: int = None) -> None:
 5238        """
 5239        This function annotate with snpEff
 5240
 5241        :param threads: The number of threads to use
 5242        :return: the value of the variable "return_value".
 5243        """
 5244
 5245        # DEBUG
 5246        log.debug("Start annotation with snpeff databases")
 5247
 5248        # Threads
 5249        if not threads:
 5250            threads = self.get_threads()
 5251        log.debug("Threads: " + str(threads))
 5252
 5253        # DEBUG
 5254        delete_tmp = True
 5255        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5256            delete_tmp = False
 5257            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5258
 5259        # Config
 5260        config = self.get_config()
 5261        log.debug("Config: " + str(config))
 5262
 5263        # Config - Folders - Databases
 5264        databases_folders = (
 5265            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5266        )
 5267        log.debug("Databases annotations: " + str(databases_folders))
 5268
 5269        # Config - snpEff bin command
 5270        snpeff_bin_command = get_bin_command(
 5271            bin="snpEff.jar",
 5272            tool="snpeff",
 5273            bin_type="jar",
 5274            config=config,
 5275            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5276        )
 5277        if not snpeff_bin_command:
 5278            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5279            log.error(msg_err)
 5280            raise ValueError(msg_err)
 5281
 5282        # Config - snpEff databases
 5283        snpeff_databases = (
 5284            config.get("folders", {})
 5285            .get("databases", {})
 5286            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5287        )
 5288        snpeff_databases = full_path(snpeff_databases)
 5289        if snpeff_databases is not None and snpeff_databases != "":
 5290            log.debug(f"Create snpEff databases folder")
 5291            if not os.path.exists(snpeff_databases):
 5292                os.makedirs(snpeff_databases)
 5293
 5294        # Param
 5295        param = self.get_param()
 5296        log.debug("Param: " + str(param))
 5297
 5298        # Param
 5299        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5300        log.debug("Options: " + str(options))
 5301
 5302        # Param - Assembly
 5303        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5304
 5305        # Param - Options
 5306        snpeff_options = (
 5307            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5308        )
 5309        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5310        snpeff_csvstats = (
 5311            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5312        )
 5313        if snpeff_stats:
 5314            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5315            snpeff_stats = full_path(snpeff_stats)
 5316            snpeff_options += f" -stats {snpeff_stats}"
 5317        if snpeff_csvstats:
 5318            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5319            snpeff_csvstats = full_path(snpeff_csvstats)
 5320            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5321
 5322        # Data
 5323        table_variants = self.get_table_variants()
 5324
 5325        # Check if not empty
 5326        log.debug("Check if not empty")
 5327        sql_query_chromosomes = (
 5328            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5329        )
 5330        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5331        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5332            log.info(f"VCF empty")
 5333            return
 5334
 5335        # Export in VCF
 5336        log.debug("Create initial file to annotate")
 5337        tmp_vcf = NamedTemporaryFile(
 5338            prefix=self.get_prefix(),
 5339            dir=self.get_tmp_dir(),
 5340            suffix=".vcf.gz",
 5341            delete=True,
 5342        )
 5343        tmp_vcf_name = tmp_vcf.name
 5344
 5345        # VCF header
 5346        vcf_reader = self.get_header()
 5347        log.debug("Initial header: " + str(vcf_reader.infos))
 5348
 5349        # Existing annotations
 5350        for vcf_annotation in self.get_header().infos:
 5351
 5352            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5353            log.debug(
 5354                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5355            )
 5356
 5357        # Memory limit
 5358        # if config.get("memory", None):
 5359        #     memory_limit = config.get("memory", "8G")
 5360        # else:
 5361        #     memory_limit = "8G"
 5362        memory_limit = self.get_memory("8G")
 5363        log.debug(f"memory_limit: {memory_limit}")
 5364
 5365        # snpEff java options
 5366        snpeff_java_options = (
 5367            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5368        )
 5369        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5370
 5371        force_update_annotation = True
 5372
 5373        if "ANN" not in self.get_header().infos or force_update_annotation:
 5374
 5375            # Check snpEff database
 5376            log.debug(f"Check snpEff databases {[assembly]}")
 5377            databases_download_snpeff(
 5378                folder=snpeff_databases, assemblies=[assembly], config=config
 5379            )
 5380
 5381            # Export VCF file
 5382            self.export_variant_vcf(
 5383                vcf_file=tmp_vcf_name,
 5384                remove_info=True,
 5385                add_samples=False,
 5386                index=True,
 5387            )
 5388
 5389            # Tmp file
 5390            err_files = []
 5391            tmp_annotate_vcf = NamedTemporaryFile(
 5392                prefix=self.get_prefix(),
 5393                dir=self.get_tmp_dir(),
 5394                suffix=".vcf",
 5395                delete=False,
 5396            )
 5397            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5398            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5399            err_files.append(tmp_annotate_vcf_name_err)
 5400
 5401            # Command
 5402            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5403            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5404            run_parallel_commands([snpeff_command], 1)
 5405
 5406            # Error messages
 5407            log.info(f"Error/Warning messages:")
 5408            error_message_command_all = []
 5409            error_message_command_warning = []
 5410            error_message_command_err = []
 5411            for err_file in err_files:
 5412                with open(err_file, "r") as f:
 5413                    for line in f:
 5414                        message = line.strip()
 5415                        error_message_command_all.append(message)
 5416                        if line.startswith("[W::"):
 5417                            error_message_command_warning.append(message)
 5418                        if line.startswith("[E::"):
 5419                            error_message_command_err.append(f"{err_file}: " + message)
 5420            # log info
 5421            for message in list(
 5422                set(error_message_command_err + error_message_command_warning)
 5423            ):
 5424                log.info(f"   {message}")
 5425            # debug info
 5426            for message in list(set(error_message_command_all)):
 5427                log.debug(f"   {message}")
 5428            # failed
 5429            if len(error_message_command_err):
 5430                log.error("Annotation failed: Error in commands")
 5431                raise ValueError("Annotation failed: Error in commands")
 5432
 5433            # Find annotation in header
 5434            with open(tmp_annotate_vcf_name, "rt") as f:
 5435                header_list = self.read_vcf_header(f)
 5436            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5437
 5438            for ann in annovar_vcf_header.infos:
 5439                if ann not in self.get_header().infos:
 5440                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5441
 5442            # Update variants
 5443            log.info(f"Annotation - Updating...")
 5444            self.update_from_vcf(tmp_annotate_vcf_name)
 5445
 5446        else:
 5447            if "ANN" in self.get_header().infos:
 5448                log.debug(f"Existing snpEff annotations in VCF")
 5449            if force_update_annotation:
 5450                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5451
 5452    def annotation_annovar(self, threads: int = None) -> None:
 5453        """
 5454        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5455        annotations
 5456
 5457        :param threads: number of threads to use
 5458        :return: the value of the variable "return_value".
 5459        """
 5460
 5461        # DEBUG
 5462        log.debug("Start annotation with Annovar databases")
 5463
 5464        # Threads
 5465        if not threads:
 5466            threads = self.get_threads()
 5467        log.debug("Threads: " + str(threads))
 5468
 5469        # Tmp en Err files
 5470        tmp_files = []
 5471        err_files = []
 5472
 5473        # DEBUG
 5474        delete_tmp = True
 5475        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5476            delete_tmp = False
 5477            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5478
 5479        # Config
 5480        config = self.get_config()
 5481        log.debug("Config: " + str(config))
 5482
 5483        # Config - Folders - Databases
 5484        databases_folders = (
 5485            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5486        )
 5487        log.debug("Databases annotations: " + str(databases_folders))
 5488
 5489        # Config - annovar bin command
 5490        annovar_bin_command = get_bin_command(
 5491            bin="table_annovar.pl",
 5492            tool="annovar",
 5493            bin_type="perl",
 5494            config=config,
 5495            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5496        )
 5497        if not annovar_bin_command:
 5498            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5499            log.error(msg_err)
 5500            raise ValueError(msg_err)
 5501
 5502        # Config - BCFTools bin command
 5503        bcftools_bin_command = get_bin_command(
 5504            bin="bcftools",
 5505            tool="bcftools",
 5506            bin_type="bin",
 5507            config=config,
 5508            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5509        )
 5510        if not bcftools_bin_command:
 5511            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5512            log.error(msg_err)
 5513            raise ValueError(msg_err)
 5514
 5515        # Config - annovar databases
 5516        annovar_databases = (
 5517            config.get("folders", {})
 5518            .get("databases", {})
 5519            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5520        )
 5521        if annovar_databases is not None:
 5522            if isinstance(annovar_databases, list):
 5523                annovar_databases = full_path(annovar_databases[0])
 5524                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5525            annovar_databases = full_path(annovar_databases)
 5526            if not os.path.exists(annovar_databases):
 5527                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5528                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5529        else:
 5530            msg_err = f"Annovar databases configuration failed"
 5531            log.error(msg_err)
 5532            raise ValueError(msg_err)
 5533
 5534        # Param
 5535        param = self.get_param()
 5536        log.debug("Param: " + str(param))
 5537
 5538        # Param - options
 5539        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5540        log.debug("Options: " + str(options))
 5541
 5542        # Param - annotations
 5543        annotations = (
 5544            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5545        )
 5546        log.debug("Annotations: " + str(annotations))
 5547
 5548        # Param - Assembly
 5549        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5550
 5551        # Annovar database assembly
 5552        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5553        if annovar_databases_assembly != "" and not os.path.exists(
 5554            annovar_databases_assembly
 5555        ):
 5556            os.makedirs(annovar_databases_assembly)
 5557
 5558        # Data
 5559        table_variants = self.get_table_variants()
 5560
 5561        # Check if not empty
 5562        log.debug("Check if not empty")
 5563        sql_query_chromosomes = (
 5564            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5565        )
 5566        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5567        if not sql_query_chromosomes_df["count"][0]:
 5568            log.info(f"VCF empty")
 5569            return
 5570
 5571        # VCF header
 5572        vcf_reader = self.get_header()
 5573        log.debug("Initial header: " + str(vcf_reader.infos))
 5574
 5575        # Existing annotations
 5576        for vcf_annotation in self.get_header().infos:
 5577
 5578            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5579            log.debug(
 5580                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5581            )
 5582
 5583        force_update_annotation = True
 5584
 5585        if annotations:
 5586
 5587            commands = []
 5588            tmp_annotates_vcf_name_list = []
 5589
 5590            # Export in VCF
 5591            log.debug("Create initial file to annotate")
 5592            tmp_vcf = NamedTemporaryFile(
 5593                prefix=self.get_prefix(),
 5594                dir=self.get_tmp_dir(),
 5595                suffix=".vcf.gz",
 5596                delete=False,
 5597            )
 5598            tmp_vcf_name = tmp_vcf.name
 5599            tmp_files.append(tmp_vcf_name)
 5600            tmp_files.append(tmp_vcf_name + ".tbi")
 5601
 5602            # Export VCF file
 5603            self.export_variant_vcf(
 5604                vcf_file=tmp_vcf_name,
 5605                remove_info=".",
 5606                add_samples=False,
 5607                index=True,
 5608            )
 5609
 5610            # Create file for field rename
 5611            log.debug("Create file for field rename")
 5612            tmp_rename = NamedTemporaryFile(
 5613                prefix=self.get_prefix(),
 5614                dir=self.get_tmp_dir(),
 5615                suffix=".rename",
 5616                delete=False,
 5617            )
 5618            tmp_rename_name = tmp_rename.name
 5619            tmp_files.append(tmp_rename_name)
 5620
 5621            # Check Annovar database
 5622            log.debug(
 5623                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5624            )
 5625            databases_download_annovar(
 5626                folder=annovar_databases,
 5627                files=list(annotations.keys()),
 5628                assemblies=[assembly],
 5629            )
 5630
 5631            for annotation in annotations:
 5632                annotation_fields = annotations[annotation]
 5633
 5634                if not annotation_fields:
 5635                    annotation_fields = {"INFO": None}
 5636
 5637                log.info(f"Annotations Annovar - database '{annotation}'")
 5638                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5639
 5640                # Tmp file for annovar
 5641                err_files = []
 5642                tmp_annotate_vcf_directory = TemporaryDirectory(
 5643                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5644                )
 5645                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5646                tmp_annotate_vcf_name_annovar = (
 5647                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5648                )
 5649                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5650                err_files.append(tmp_annotate_vcf_name_err)
 5651                tmp_files.append(tmp_annotate_vcf_name_err)
 5652
 5653                # Tmp file final vcf annotated by annovar
 5654                tmp_annotate_vcf = NamedTemporaryFile(
 5655                    prefix=self.get_prefix(),
 5656                    dir=self.get_tmp_dir(),
 5657                    suffix=".vcf.gz",
 5658                    delete=False,
 5659                )
 5660                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5661                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5662                tmp_files.append(tmp_annotate_vcf_name)
 5663                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5664
 5665                # Number of fields
 5666                annotation_list = []
 5667                annotation_renamed_list = []
 5668
 5669                for annotation_field in annotation_fields:
 5670
 5671                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5672                    annotation_fields_new_name = annotation_fields.get(
 5673                        annotation_field, annotation_field
 5674                    )
 5675                    if not annotation_fields_new_name:
 5676                        annotation_fields_new_name = annotation_field
 5677
 5678                    if (
 5679                        force_update_annotation
 5680                        or annotation_fields_new_name not in self.get_header().infos
 5681                    ):
 5682                        annotation_list.append(annotation_field)
 5683                        annotation_renamed_list.append(annotation_fields_new_name)
 5684                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5685                        log.warning(
 5686                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5687                        )
 5688
 5689                    # Add rename info
 5690                    run_parallel_commands(
 5691                        [
 5692                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5693                        ],
 5694                        1,
 5695                    )
 5696
 5697                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5698                log.debug("annotation_list: " + str(annotation_list))
 5699
 5700                # protocol
 5701                protocol = annotation
 5702
 5703                # argument
 5704                argument = ""
 5705
 5706                # operation
 5707                operation = "f"
 5708                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5709                    "ensGene"
 5710                ):
 5711                    operation = "g"
 5712                    if options.get("genebase", None):
 5713                        argument = f"""'{options.get("genebase","")}'"""
 5714                elif annotation in ["cytoBand"]:
 5715                    operation = "r"
 5716
 5717                # argument option
 5718                argument_option = ""
 5719                if argument != "":
 5720                    argument_option = " --argument " + argument
 5721
 5722                # command options
 5723                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5724                for option in options:
 5725                    if option not in ["genebase"]:
 5726                        command_options += f""" --{option}={options[option]}"""
 5727
 5728                # Command
 5729
 5730                # Command - Annovar
 5731                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5732                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5733
 5734                # Command - start pipe
 5735                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5736
 5737                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5738                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5739
 5740                # Command - Special characters (refGene annotation)
 5741                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5742
 5743                # Command - Clean empty fields (with value ".")
 5744                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5745
 5746                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5747                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5748                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5749                    # for ann in annotation_renamed_list:
 5750                    for ann in annotation_list:
 5751                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5752
 5753                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5754
 5755                # Command - indexing
 5756                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5757
 5758                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5759                run_parallel_commands([command_annovar], 1)
 5760
 5761                # Error messages
 5762                log.info(f"Error/Warning messages:")
 5763                error_message_command_all = []
 5764                error_message_command_warning = []
 5765                error_message_command_err = []
 5766                for err_file in err_files:
 5767                    with open(err_file, "r") as f:
 5768                        for line in f:
 5769                            message = line.strip()
 5770                            error_message_command_all.append(message)
 5771                            if line.startswith("[W::") or line.startswith("WARNING"):
 5772                                error_message_command_warning.append(message)
 5773                            if line.startswith("[E::") or line.startswith("ERROR"):
 5774                                error_message_command_err.append(
 5775                                    f"{err_file}: " + message
 5776                                )
 5777                # log info
 5778                for message in list(
 5779                    set(error_message_command_err + error_message_command_warning)
 5780                ):
 5781                    log.info(f"   {message}")
 5782                # debug info
 5783                for message in list(set(error_message_command_all)):
 5784                    log.debug(f"   {message}")
 5785                # failed
 5786                if len(error_message_command_err):
 5787                    log.error("Annotation failed: Error in commands")
 5788                    raise ValueError("Annotation failed: Error in commands")
 5789
 5790            if tmp_annotates_vcf_name_list:
 5791
 5792                # List of annotated files
 5793                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5794
 5795                # Tmp file
 5796                tmp_annotate_vcf = NamedTemporaryFile(
 5797                    prefix=self.get_prefix(),
 5798                    dir=self.get_tmp_dir(),
 5799                    suffix=".vcf.gz",
 5800                    delete=False,
 5801                )
 5802                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5803                tmp_files.append(tmp_annotate_vcf_name)
 5804                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5805                err_files.append(tmp_annotate_vcf_name_err)
 5806                tmp_files.append(tmp_annotate_vcf_name_err)
 5807
 5808                # Command merge
 5809                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5810                log.info(
 5811                    f"Annotation Annovar - Annotation merging "
 5812                    + str(len(tmp_annotates_vcf_name_list))
 5813                    + " annotated files"
 5814                )
 5815                log.debug(f"Annotation - merge command: {merge_command}")
 5816                run_parallel_commands([merge_command], 1)
 5817
 5818                # Find annotation in header
 5819                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5820                    header_list = self.read_vcf_header(f)
 5821                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5822
 5823                for ann in annovar_vcf_header.infos:
 5824                    if ann not in self.get_header().infos:
 5825                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5826
 5827                # Update variants
 5828                log.info(f"Annotation Annovar - Updating...")
 5829                self.update_from_vcf(tmp_annotate_vcf_name)
 5830
 5831            # Clean files
 5832            # Tmp file remove command
 5833            if True:
 5834                tmp_files_remove_command = ""
 5835                if tmp_files:
 5836                    tmp_files_remove_command = " ".join(tmp_files)
 5837                clean_command = f" rm -f {tmp_files_remove_command} "
 5838                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5839                log.debug(f"Annotation - cleaning command: {clean_command}")
 5840                run_parallel_commands([clean_command], 1)
 5841
 5842    # Parquet
 5843    def annotation_parquet(self, threads: int = None) -> None:
 5844        """
 5845        It takes a VCF file, and annotates it with a parquet file
 5846
 5847        :param threads: number of threads to use for the annotation
 5848        :return: the value of the variable "result".
 5849        """
 5850
 5851        # DEBUG
 5852        log.debug("Start annotation with parquet databases")
 5853
 5854        # Threads
 5855        if not threads:
 5856            threads = self.get_threads()
 5857        log.debug("Threads: " + str(threads))
 5858
 5859        # DEBUG
 5860        delete_tmp = True
 5861        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5862            delete_tmp = False
 5863            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5864
 5865        # Config
 5866        databases_folders = set(
 5867            self.get_config()
 5868            .get("folders", {})
 5869            .get("databases", {})
 5870            .get("annotations", ["."])
 5871            + self.get_config()
 5872            .get("folders", {})
 5873            .get("databases", {})
 5874            .get("parquet", ["."])
 5875        )
 5876        log.debug("Databases annotations: " + str(databases_folders))
 5877
 5878        # Param
 5879        annotations = (
 5880            self.get_param()
 5881            .get("annotation", {})
 5882            .get("parquet", {})
 5883            .get("annotations", None)
 5884        )
 5885        log.debug("Annotations: " + str(annotations))
 5886
 5887        # Assembly
 5888        assembly = self.get_param().get(
 5889            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5890        )
 5891
 5892        # Force Update Annotation
 5893        force_update_annotation = (
 5894            self.get_param()
 5895            .get("annotation", {})
 5896            .get("options", {})
 5897            .get("annotations_update", False)
 5898        )
 5899        log.debug(f"force_update_annotation={force_update_annotation}")
 5900        force_append_annotation = (
 5901            self.get_param()
 5902            .get("annotation", {})
 5903            .get("options", {})
 5904            .get("annotations_append", False)
 5905        )
 5906        log.debug(f"force_append_annotation={force_append_annotation}")
 5907
 5908        # Data
 5909        table_variants = self.get_table_variants()
 5910
 5911        # Check if not empty
 5912        log.debug("Check if not empty")
 5913        sql_query_chromosomes_df = self.get_query_to_df(
 5914            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5915        )
 5916        if not sql_query_chromosomes_df["count"][0]:
 5917            log.info(f"VCF empty")
 5918            return
 5919
 5920        # VCF header
 5921        vcf_reader = self.get_header()
 5922        log.debug("Initial header: " + str(vcf_reader.infos))
 5923
 5924        # Nb Variants POS
 5925        log.debug("NB Variants Start")
 5926        nb_variants = self.conn.execute(
 5927            f"SELECT count(*) AS count FROM variants"
 5928        ).fetchdf()["count"][0]
 5929        log.debug("NB Variants Stop")
 5930
 5931        # Existing annotations
 5932        for vcf_annotation in self.get_header().infos:
 5933
 5934            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5935            log.debug(
 5936                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5937            )
 5938
 5939        # Added columns
 5940        added_columns = []
 5941
 5942        # drop indexes
 5943        log.debug(f"Drop indexes...")
 5944        self.drop_indexes()
 5945
 5946        if annotations:
 5947
 5948            if "ALL" in annotations:
 5949
 5950                all_param = annotations.get("ALL", {})
 5951                all_param_formats = all_param.get("formats", None)
 5952                all_param_releases = all_param.get("releases", None)
 5953
 5954                databases_infos_dict = self.scan_databases(
 5955                    database_formats=all_param_formats,
 5956                    database_releases=all_param_releases,
 5957                )
 5958                for database_infos in databases_infos_dict.keys():
 5959                    if database_infos not in annotations:
 5960                        annotations[database_infos] = {"INFO": None}
 5961
 5962            for annotation in annotations:
 5963
 5964                if annotation in ["ALL"]:
 5965                    continue
 5966
 5967                # Annotation Name
 5968                annotation_name = os.path.basename(annotation)
 5969
 5970                # Annotation fields
 5971                annotation_fields = annotations[annotation]
 5972                if not annotation_fields:
 5973                    annotation_fields = {"INFO": None}
 5974
 5975                log.debug(f"Annotation '{annotation_name}'")
 5976                log.debug(
 5977                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5978                )
 5979
 5980                # Create Database
 5981                database = Database(
 5982                    database=annotation,
 5983                    databases_folders=databases_folders,
 5984                    assembly=assembly,
 5985                )
 5986
 5987                # Find files
 5988                parquet_file = database.get_database()
 5989                parquet_hdr_file = database.get_header_file()
 5990                parquet_type = database.get_type()
 5991
 5992                # Check if files exists
 5993                if not parquet_file or not parquet_hdr_file:
 5994                    msg_err_list = []
 5995                    if not parquet_file:
 5996                        msg_err_list.append(
 5997                            f"Annotation failed: Annotation file not found"
 5998                        )
 5999                    if parquet_file and not parquet_hdr_file:
 6000                        msg_err_list.append(
 6001                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 6002                        )
 6003
 6004                    log.error(". ".join(msg_err_list))
 6005                    raise ValueError(". ".join(msg_err_list))
 6006                else:
 6007                    # Get parquet connexion
 6008                    parquet_sql_attach = database.get_sql_database_attach(
 6009                        output="query"
 6010                    )
 6011                    if parquet_sql_attach:
 6012                        self.conn.execute(parquet_sql_attach)
 6013                    parquet_file_link = database.get_sql_database_link()
 6014                    # Log
 6015                    log.debug(
 6016                        f"Annotation '{annotation_name}' - file: "
 6017                        + str(parquet_file)
 6018                        + " and "
 6019                        + str(parquet_hdr_file)
 6020                    )
 6021
 6022                    # Database full header columns
 6023                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6024                        parquet_hdr_file
 6025                    )
 6026                    # Log
 6027                    log.debug(
 6028                        "Annotation database header columns : "
 6029                        + str(parquet_hdr_vcf_header_columns)
 6030                    )
 6031
 6032                    # Load header as VCF object
 6033                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6034                    # Log
 6035                    log.debug(
 6036                        "Annotation database header: "
 6037                        + str(parquet_hdr_vcf_header_infos)
 6038                    )
 6039
 6040                    # Get extra infos
 6041                    parquet_columns = database.get_extra_columns()
 6042                    # Log
 6043                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6044
 6045                    # Add extra columns if "ALL" in annotation_fields
 6046                    # if "ALL" in annotation_fields:
 6047                    #     allow_add_extra_column = True
 6048                    if "ALL" in annotation_fields and database.get_extra_columns():
 6049                        for extra_column in database.get_extra_columns():
 6050                            if (
 6051                                extra_column not in annotation_fields
 6052                                and extra_column.replace("INFO/", "")
 6053                                not in parquet_hdr_vcf_header_infos
 6054                            ):
 6055                                parquet_hdr_vcf_header_infos[extra_column] = (
 6056                                    vcf.parser._Info(
 6057                                        extra_column,
 6058                                        ".",
 6059                                        "String",
 6060                                        f"{extra_column} description",
 6061                                        "unknown",
 6062                                        "unknown",
 6063                                        self.code_type_map["String"],
 6064                                    )
 6065                                )
 6066
 6067                    # For all fields in database
 6068                    annotation_fields_all = False
 6069                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6070                        annotation_fields_all = True
 6071                        annotation_fields = {
 6072                            key: key for key in parquet_hdr_vcf_header_infos
 6073                        }
 6074
 6075                        log.debug(
 6076                            "Annotation database header - All annotations added: "
 6077                            + str(annotation_fields)
 6078                        )
 6079
 6080                    # Init
 6081
 6082                    # List of annotation fields to use
 6083                    sql_query_annotation_update_info_sets = []
 6084
 6085                    # List of annotation to agregate
 6086                    sql_query_annotation_to_agregate = []
 6087
 6088                    # Number of fields
 6089                    nb_annotation_field = 0
 6090
 6091                    # Annotation fields processed
 6092                    annotation_fields_processed = []
 6093
 6094                    # Columns mapping
 6095                    map_columns = database.map_columns(
 6096                        columns=annotation_fields, prefixes=["INFO/"]
 6097                    )
 6098
 6099                    # Query dict for fields to remove (update option)
 6100                    query_dict_remove = {}
 6101
 6102                    # Fetch Anotation fields
 6103                    for annotation_field in annotation_fields:
 6104
 6105                        # annotation_field_column
 6106                        annotation_field_column = map_columns.get(
 6107                            annotation_field, "INFO"
 6108                        )
 6109
 6110                        # field new name, if parametered
 6111                        annotation_fields_new_name = annotation_fields.get(
 6112                            annotation_field, annotation_field
 6113                        )
 6114                        if not annotation_fields_new_name:
 6115                            annotation_fields_new_name = annotation_field
 6116
 6117                        # To annotate
 6118                        # force_update_annotation = True
 6119                        # force_append_annotation = True
 6120                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6121                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6122                            force_update_annotation
 6123                            or force_append_annotation
 6124                            or (
 6125                                annotation_fields_new_name
 6126                                not in self.get_header().infos
 6127                            )
 6128                        ):
 6129
 6130                            # Add field to annotation to process list
 6131                            annotation_fields_processed.append(
 6132                                annotation_fields_new_name
 6133                            )
 6134
 6135                            # explode infos for the field
 6136                            annotation_fields_new_name_info_msg = ""
 6137                            if (
 6138                                force_update_annotation
 6139                                and annotation_fields_new_name
 6140                                in self.get_header().infos
 6141                            ):
 6142                                # Remove field from INFO
 6143                                query = f"""
 6144                                    UPDATE {table_variants} as table_variants
 6145                                    SET INFO = REGEXP_REPLACE(
 6146                                                concat(table_variants.INFO,''),
 6147                                                ';*{annotation_fields_new_name}=[^;]*',
 6148                                                ''
 6149                                                )
 6150                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6151                                """
 6152                                annotation_fields_new_name_info_msg = " [update]"
 6153                                query_dict_remove[
 6154                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6155                                ] = query
 6156
 6157                            # Sep between fields in INFO
 6158                            nb_annotation_field += 1
 6159                            if nb_annotation_field > 1:
 6160                                annotation_field_sep = ";"
 6161                            else:
 6162                                annotation_field_sep = ""
 6163
 6164                            log.info(
 6165                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6166                            )
 6167
 6168                            # Add INFO field to header
 6169                            parquet_hdr_vcf_header_infos_number = (
 6170                                parquet_hdr_vcf_header_infos[annotation_field].num
 6171                                or "."
 6172                            )
 6173                            parquet_hdr_vcf_header_infos_type = (
 6174                                parquet_hdr_vcf_header_infos[annotation_field].type
 6175                                or "String"
 6176                            )
 6177                            parquet_hdr_vcf_header_infos_description = (
 6178                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6179                                or f"{annotation_field} description"
 6180                            )
 6181                            parquet_hdr_vcf_header_infos_source = (
 6182                                parquet_hdr_vcf_header_infos[annotation_field].source
 6183                                or "unknown"
 6184                            )
 6185                            parquet_hdr_vcf_header_infos_version = (
 6186                                parquet_hdr_vcf_header_infos[annotation_field].version
 6187                                or "unknown"
 6188                            )
 6189
 6190                            vcf_reader.infos[annotation_fields_new_name] = (
 6191                                vcf.parser._Info(
 6192                                    annotation_fields_new_name,
 6193                                    parquet_hdr_vcf_header_infos_number,
 6194                                    parquet_hdr_vcf_header_infos_type,
 6195                                    parquet_hdr_vcf_header_infos_description,
 6196                                    parquet_hdr_vcf_header_infos_source,
 6197                                    parquet_hdr_vcf_header_infos_version,
 6198                                    self.code_type_map[
 6199                                        parquet_hdr_vcf_header_infos_type
 6200                                    ],
 6201                                )
 6202                            )
 6203
 6204                            # Append
 6205                            if force_append_annotation:
 6206                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6207                            else:
 6208                                query_case_when_append = ""
 6209
 6210                            # Annotation/Update query fields
 6211                            # Found in INFO column
 6212                            if (
 6213                                annotation_field_column == "INFO"
 6214                                and "INFO" in parquet_hdr_vcf_header_columns
 6215                            ):
 6216                                sql_query_annotation_update_info_sets.append(
 6217                                    f"""
 6218                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6219                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6220                                        ELSE ''
 6221                                    END
 6222                                """
 6223                                )
 6224                            # Found in a specific column
 6225                            else:
 6226                                sql_query_annotation_update_info_sets.append(
 6227                                    f"""
 6228                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6229                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6230                                        ELSE ''
 6231                                    END
 6232                                """
 6233                                )
 6234                                sql_query_annotation_to_agregate.append(
 6235                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6236                                )
 6237
 6238                        # Not to annotate
 6239                        else:
 6240
 6241                            if force_update_annotation:
 6242                                annotation_message = "forced"
 6243                            else:
 6244                                annotation_message = "skipped"
 6245
 6246                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6247                                log.warning(
 6248                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6249                                )
 6250                            if annotation_fields_new_name in self.get_header().infos:
 6251                                log.warning(
 6252                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6253                                )
 6254
 6255                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6256                    # allow_annotation_full_info = True
 6257                    allow_annotation_full_info = not force_append_annotation
 6258
 6259                    if parquet_type in ["regions"]:
 6260                        allow_annotation_full_info = False
 6261
 6262                    if (
 6263                        allow_annotation_full_info
 6264                        and nb_annotation_field == len(annotation_fields)
 6265                        and annotation_fields_all
 6266                        and (
 6267                            "INFO" in parquet_hdr_vcf_header_columns
 6268                            and "INFO" in database.get_extra_columns()
 6269                        )
 6270                    ):
 6271                        log.debug("Column INFO annotation enabled")
 6272                        sql_query_annotation_update_info_sets = []
 6273                        sql_query_annotation_update_info_sets.append(
 6274                            f" table_parquet.INFO "
 6275                        )
 6276
 6277                    if sql_query_annotation_update_info_sets:
 6278
 6279                        # Annotate
 6280                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6281
 6282                        # Join query annotation update info sets for SQL
 6283                        sql_query_annotation_update_info_sets_sql = ",".join(
 6284                            sql_query_annotation_update_info_sets
 6285                        )
 6286
 6287                        # Check chromosomes list (and variants infos)
 6288                        sql_query_chromosomes = f"""
 6289                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6290                            FROM {table_variants} as table_variants
 6291                            GROUP BY table_variants."#CHROM"
 6292                            ORDER BY table_variants."#CHROM"
 6293                            """
 6294                        sql_query_chromosomes_df = self.conn.execute(
 6295                            sql_query_chromosomes
 6296                        ).df()
 6297                        sql_query_chromosomes_dict = {
 6298                            entry["CHROM"]: {
 6299                                "count": entry["count_variants"],
 6300                                "min": entry["min_variants"],
 6301                                "max": entry["max_variants"],
 6302                            }
 6303                            for index, entry in sql_query_chromosomes_df.iterrows()
 6304                        }
 6305
 6306                        # Init
 6307                        nb_of_query = 0
 6308                        nb_of_variant_annotated = 0
 6309                        query_dict = query_dict_remove
 6310
 6311                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6312                        for chrom in sql_query_chromosomes_dict:
 6313
 6314                            # Number of variant by chromosome
 6315                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6316                                chrom, {}
 6317                            ).get("count", 0)
 6318
 6319                            log.debug(
 6320                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6321                            )
 6322
 6323                            # Annotation with regions database
 6324                            if parquet_type in ["regions"]:
 6325                                sql_query_annotation_from_clause = f"""
 6326                                    FROM (
 6327                                        SELECT 
 6328                                            '{chrom}' AS \"#CHROM\",
 6329                                            table_variants_from.\"POS\" AS \"POS\",
 6330                                            {",".join(sql_query_annotation_to_agregate)}
 6331                                        FROM {table_variants} as table_variants_from
 6332                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6333                                            table_parquet_from."#CHROM" = '{chrom}'
 6334                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6335                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6336                                        )
 6337                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6338                                        GROUP BY table_variants_from.\"POS\"
 6339                                        )
 6340                                        as table_parquet
 6341                                """
 6342
 6343                                sql_query_annotation_where_clause = """
 6344                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6345                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6346                                """
 6347
 6348                            # Annotation with variants database
 6349                            else:
 6350                                sql_query_annotation_from_clause = f"""
 6351                                    FROM {parquet_file_link} as table_parquet
 6352                                """
 6353                                sql_query_annotation_where_clause = f"""
 6354                                    table_variants."#CHROM" = '{chrom}'
 6355                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6356                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6357                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6358                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6359                                """
 6360
 6361                            # Create update query
 6362                            sql_query_annotation_chrom_interval_pos = f"""
 6363                                UPDATE {table_variants} as table_variants
 6364                                    SET INFO = 
 6365                                        concat(
 6366                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6367                                                THEN table_variants.INFO
 6368                                                ELSE ''
 6369                                            END
 6370                                            ,
 6371                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6372                                                        AND (
 6373                                                        concat({sql_query_annotation_update_info_sets_sql})
 6374                                                        )
 6375                                                        NOT IN ('','.') 
 6376                                                    THEN ';'
 6377                                                    ELSE ''
 6378                                            END
 6379                                            ,
 6380                                            {sql_query_annotation_update_info_sets_sql}
 6381                                            )
 6382                                    {sql_query_annotation_from_clause}
 6383                                    WHERE {sql_query_annotation_where_clause}
 6384                                    ;
 6385                                """
 6386
 6387                            # Add update query to dict
 6388                            query_dict[
 6389                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6390                            ] = sql_query_annotation_chrom_interval_pos
 6391
 6392                        nb_of_query = len(query_dict)
 6393                        num_query = 0
 6394
 6395                        # SET max_expression_depth TO x
 6396                        self.conn.execute("SET max_expression_depth TO 10000")
 6397
 6398                        for query_name in query_dict:
 6399                            query = query_dict[query_name]
 6400                            num_query += 1
 6401                            log.info(
 6402                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6403                            )
 6404                            result = self.conn.execute(query)
 6405                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6406                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6407                            log.info(
 6408                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6409                            )
 6410
 6411                        log.info(
 6412                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6413                        )
 6414
 6415                    else:
 6416
 6417                        log.info(
 6418                            f"Annotation '{annotation_name}' - No Annotations available"
 6419                        )
 6420
 6421                    log.debug("Final header: " + str(vcf_reader.infos))
 6422
 6423        # Remove added columns
 6424        for added_column in added_columns:
 6425            self.drop_column(column=added_column)
 6426
 6427    def annotation_splice(self, threads: int = None) -> None:
 6428        """
 6429        This function annotate with snpEff
 6430
 6431        :param threads: The number of threads to use
 6432        :return: the value of the variable "return_value".
 6433        """
 6434
 6435        # DEBUG
 6436        log.debug("Start annotation with splice tools")
 6437
 6438        # Threads
 6439        if not threads:
 6440            threads = self.get_threads()
 6441        log.debug("Threads: " + str(threads))
 6442
 6443        # DEBUG
 6444        delete_tmp = True
 6445        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6446            delete_tmp = False
 6447            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6448
 6449        # Config
 6450        config = self.get_config()
 6451        log.debug("Config: " + str(config))
 6452        splice_config = config.get("tools", {}).get("splice", {})
 6453        if not splice_config:
 6454            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6455            msg_err = "No Splice tool config"
 6456            raise ValueError(msg_err)
 6457        log.debug(f"splice_config: {splice_config}")
 6458
 6459        # Config - Folders - Databases
 6460        databases_folders = (
 6461            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6462        )
 6463        log.debug("Databases annotations: " + str(databases_folders))
 6464
 6465        # Splice docker image
 6466        splice_docker_image = splice_config.get("docker").get("image")
 6467
 6468        # Pull splice image if it's not already there
 6469        if not check_docker_image_exists(splice_docker_image):
 6470            log.warning(
 6471                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6472            )
 6473            try:
 6474                command(f"docker pull {splice_config.get('docker').get('image')}")
 6475            except subprocess.CalledProcessError:
 6476                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6477                log.error(msg_err)
 6478                raise ValueError(msg_err)
 6479
 6480        # Config - splice databases
 6481        splice_databases = (
 6482            config.get("folders", {})
 6483            .get("databases", {})
 6484            .get("splice", DEFAULT_SPLICE_FOLDER)
 6485        )
 6486        splice_databases = full_path(splice_databases)
 6487
 6488        # Param
 6489        param = self.get_param()
 6490        log.debug("Param: " + str(param))
 6491
 6492        # Param
 6493        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6494        log.debug("Options: " + str(options))
 6495
 6496        # Data
 6497        table_variants = self.get_table_variants()
 6498
 6499        # Check if not empty
 6500        log.debug("Check if not empty")
 6501        sql_query_chromosomes = (
 6502            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6503        )
 6504        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6505            log.info("VCF empty")
 6506            return None
 6507
 6508        # Export in VCF
 6509        log.debug("Create initial file to annotate")
 6510
 6511        # Create output folder / work folder
 6512        if options.get("output_folder", ""):
 6513            output_folder = options.get("output_folder", "")
 6514            if not os.path.exists(output_folder):
 6515                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6516        else:
 6517            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6518            if not os.path.exists(output_folder):
 6519                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6520
 6521        if options.get("workdir", ""):
 6522            workdir = options.get("workdir", "")
 6523        else:
 6524            workdir = "/work"
 6525
 6526        # Create tmp VCF file
 6527        tmp_vcf = NamedTemporaryFile(
 6528            prefix=self.get_prefix(),
 6529            dir=output_folder,
 6530            suffix=".vcf",
 6531            delete=False,
 6532        )
 6533        tmp_vcf_name = tmp_vcf.name
 6534
 6535        # VCF header
 6536        header = self.get_header()
 6537
 6538        # Existing annotations
 6539        for vcf_annotation in self.get_header().infos:
 6540
 6541            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6542            log.debug(
 6543                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6544            )
 6545
 6546        # Memory limit
 6547        if config.get("memory", None):
 6548            memory_limit = config.get("memory", "8G").upper()
 6549            # upper()
 6550        else:
 6551            memory_limit = "8G"
 6552        log.debug(f"memory_limit: {memory_limit}")
 6553
 6554        # Check number of variants to annotate
 6555        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6556        where_clause_regex_spip = r"SPiP_\w+"
 6557        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6558        df_list_of_variants_to_annotate = self.get_query_to_df(
 6559            query=f""" SELECT * FROM variants {where_clause} """
 6560        )
 6561        if len(df_list_of_variants_to_annotate) == 0:
 6562            log.warning(
 6563                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6564            )
 6565            return None
 6566        else:
 6567            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6568
 6569        # Export VCF file
 6570        self.export_variant_vcf(
 6571            vcf_file=tmp_vcf_name,
 6572            remove_info=True,
 6573            add_samples=True,
 6574            index=False,
 6575            where_clause=where_clause,
 6576        )
 6577        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6578        if any(value for value in splice_config.values() if value is None):
 6579            log.warning("At least one splice config parameter is empty")
 6580            # exit annotation_splice
 6581            return None
 6582
 6583        # Params in splice nf
 6584        def check_values(dico: dict):
 6585            """
 6586            Ensure parameters for NF splice pipeline
 6587            """
 6588            for key, val in dico.items():
 6589                if key == "genome":
 6590                    if any(
 6591                        assemb in options.get("genome", {})
 6592                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6593                    ):
 6594                        yield f"--{key} hg19"
 6595                    elif any(
 6596                        assemb in options.get("genome", {})
 6597                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6598                    ):
 6599                        yield f"--{key} hg38"
 6600                elif (
 6601                    (isinstance(val, str) and val)
 6602                    or isinstance(val, int)
 6603                    or isinstance(val, bool)
 6604                ):
 6605                    yield f"--{key} {val}"
 6606
 6607        # Genome
 6608        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6609        options["genome"] = genome
 6610        # NF params
 6611        nf_params = []
 6612        # Add options
 6613        if options:
 6614            log.debug(options)
 6615            nf_params = list(check_values(options))
 6616            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6617        else:
 6618            log.debug("No NF params provided")
 6619        # Add threads
 6620        if "threads" not in options.keys():
 6621            nf_params.append(f"--threads {threads}")
 6622        # Genome path
 6623        genome_path = find_genome(
 6624            config.get("folders", {})
 6625            .get("databases", {})
 6626            .get("genomes", DEFAULT_GENOME_FOLDER),
 6627            file=f"{genome}.fa",
 6628        )
 6629        # Add genome path
 6630        if not genome_path:
 6631            raise ValueError(
 6632                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6633            )
 6634        else:
 6635            log.debug(f"Genome: {genome_path}")
 6636            nf_params.append(f"--genome_path {genome_path}")
 6637
 6638        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6639            """
 6640            Setting up updated databases for SPiP and SpliceAI
 6641            """
 6642
 6643            try:
 6644
 6645                # SpliceAI assembly transcriptome
 6646                spliceai_assembly = os.path.join(
 6647                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6648                    options.get("genome"),
 6649                    "transcriptome",
 6650                )
 6651                spip_assembly = options.get("genome")
 6652
 6653                spip = find(
 6654                    f"transcriptome_{spip_assembly}.RData",
 6655                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6656                )
 6657                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6658                log.debug(f"SPiP annotations: {spip}")
 6659                log.debug(f"SpliceAI annotations: {spliceai}")
 6660                if spip and spliceai:
 6661                    return [
 6662                        f"--spip_transcriptome {spip}",
 6663                        f"--spliceai_transcriptome {spliceai}",
 6664                    ]
 6665                else:
 6666                    log.warning(
 6667                        "Can't find splice databases in configuration, use annotations file from image"
 6668                    )
 6669            except TypeError:
 6670                log.warning(
 6671                    "Can't find splice databases in configuration, use annotations file from image"
 6672                )
 6673                return []
 6674
 6675        # Add options, check if transcriptome option have already beend provided
 6676        if (
 6677            "spip_transcriptome" not in nf_params
 6678            and "spliceai_transcriptome" not in nf_params
 6679        ):
 6680            splice_reference = splice_annotations(options, config)
 6681            if splice_reference:
 6682                nf_params.extend(splice_reference)
 6683        # nf_params.append(f"--output_folder {output_folder}")
 6684        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6685        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6686        log.debug(cmd)
 6687        splice_config["docker"]["command"] = cmd
 6688
 6689        # Ensure proxy is set
 6690        proxy = [
 6691            f"-e {var}={os.getenv(var)}"
 6692            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6693            if os.getenv(var) is not None
 6694        ]
 6695        docker_cmd = get_bin_command(
 6696            tool="splice",
 6697            bin_type="docker",
 6698            config=config,
 6699            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6700            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6701        )
 6702        # print(docker_cmd)
 6703        # exit()
 6704        # Docker debug
 6705        # if splice_config.get("rm_container"):
 6706        #     rm_container = "--rm"
 6707        # else:
 6708        #     rm_container = ""
 6709        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6710        log.debug(docker_cmd)
 6711        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6712        log.debug(res.stdout)
 6713        if res.stderr:
 6714            log.error(res.stderr)
 6715        res.check_returncode()
 6716        # Update variants
 6717        log.info("Annotation - Updating...")
 6718        # Test find output vcf
 6719        log.debug(
 6720            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6721        )
 6722        output_vcf = []
 6723        # Wrong folder to look in
 6724        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6725            if (
 6726                files
 6727                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6728            ):
 6729                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6730        # log.debug(os.listdir(options.get("output_folder")))
 6731        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6732        if not output_vcf:
 6733            log.debug(
 6734                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6735            )
 6736        else:
 6737            # Get new header from annotated vcf
 6738            log.debug(f"Initial header: {len(header.infos)} fields")
 6739            # Create new header with splice infos
 6740            new_vcf = Variants(input=output_vcf[0])
 6741            new_vcf_header = new_vcf.get_header().infos
 6742            for keys, infos in new_vcf_header.items():
 6743                if keys not in header.infos.keys():
 6744                    header.infos[keys] = infos
 6745            log.debug(f"New header: {len(header.infos)} fields")
 6746            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6747            self.update_from_vcf(output_vcf[0])
 6748
 6749        # Remove file
 6750        remove_if_exists(output_vcf)
 6751
 6752    ###
 6753    # Prioritization
 6754    ###
 6755
 6756    def get_config_default(self, name: str) -> dict:
 6757        """
 6758        The function `get_config_default` returns a dictionary containing default configurations for
 6759        various calculations and prioritizations.
 6760
 6761        :param name: The `get_config_default` function returns a dictionary containing default
 6762        configurations for different calculations and prioritizations. The `name` parameter is used to
 6763        specify which specific configuration to retrieve from the dictionary
 6764        :type name: str
 6765        :return: The function `get_config_default` returns a dictionary containing default configuration
 6766        settings for different calculations and prioritizations. The specific configuration settings are
 6767        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6768        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6769        returned. If there is no match, an empty dictionary is returned.
 6770        """
 6771
 6772        config_default = {
 6773            "calculations": {
 6774                "variant_chr_pos_alt_ref": {
 6775                    "type": "sql",
 6776                    "name": "variant_chr_pos_alt_ref",
 6777                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6778                    "available": False,
 6779                    "output_column_name": "variant_chr_pos_alt_ref",
 6780                    "output_column_type": "String",
 6781                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6782                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6783                    "operation_info": True,
 6784                },
 6785                "VARTYPE": {
 6786                    "type": "sql",
 6787                    "name": "VARTYPE",
 6788                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6789                    "available": True,
 6790                    "table": "variants",
 6791                    "output_column_name": "VARTYPE",
 6792                    "output_column_type": "String",
 6793                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6794                    "operation_query": """
 6795                            CASE
 6796                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6797                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6798                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6799                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6800                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6801                                ELSE 'UNDEFINED'
 6802                            END
 6803                            """,
 6804                    "info_fields": ["SVTYPE"],
 6805                    "operation_info": True,
 6806                },
 6807                "snpeff_hgvs": {
 6808                    "type": "python",
 6809                    "name": "snpeff_hgvs",
 6810                    "description": "HGVS nomenclatures from snpEff annotation",
 6811                    "available": True,
 6812                    "function_name": "calculation_extract_snpeff_hgvs",
 6813                    "function_params": ["snpeff_hgvs", "ANN"],
 6814                },
 6815                "snpeff_ann_explode": {
 6816                    "type": "python",
 6817                    "name": "snpeff_ann_explode",
 6818                    "description": "Explode snpEff annotations with uniquify values",
 6819                    "available": True,
 6820                    "function_name": "calculation_snpeff_ann_explode",
 6821                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6822                },
 6823                "snpeff_ann_explode_uniquify": {
 6824                    "type": "python",
 6825                    "name": "snpeff_ann_explode_uniquify",
 6826                    "description": "Explode snpEff annotations",
 6827                    "available": True,
 6828                    "function_name": "calculation_snpeff_ann_explode",
 6829                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6830                },
 6831                "snpeff_ann_explode_json": {
 6832                    "type": "python",
 6833                    "name": "snpeff_ann_explode_json",
 6834                    "description": "Explode snpEff annotations in JSON format",
 6835                    "available": True,
 6836                    "function_name": "calculation_snpeff_ann_explode",
 6837                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6838                },
 6839                "NOMEN": {
 6840                    "type": "python",
 6841                    "name": "NOMEN",
 6842                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6843                    "available": True,
 6844                    "function_name": "calculation_extract_nomen",
 6845                    "function_params": [],
 6846                },
 6847                "RENAME_INFO_FIELDS": {
 6848                    "type": "python",
 6849                    "name": "RENAME_INFO_FIELDS",
 6850                    "description": "Rename or remove INFO/tags",
 6851                    "available": True,
 6852                    "function_name": "calculation_rename_info_fields",
 6853                    "function_params": [],
 6854                },
 6855                "FINDBYPIPELINE": {
 6856                    "type": "python",
 6857                    "name": "FINDBYPIPELINE",
 6858                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6859                    "available": True,
 6860                    "function_name": "calculation_find_by_pipeline",
 6861                    "function_params": ["findbypipeline"],
 6862                },
 6863                "FINDBYSAMPLE": {
 6864                    "type": "python",
 6865                    "name": "FINDBYSAMPLE",
 6866                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6867                    "available": True,
 6868                    "function_name": "calculation_find_by_pipeline",
 6869                    "function_params": ["findbysample"],
 6870                },
 6871                "GENOTYPECONCORDANCE": {
 6872                    "type": "python",
 6873                    "name": "GENOTYPECONCORDANCE",
 6874                    "description": "Concordance of genotype for multi caller VCF",
 6875                    "available": True,
 6876                    "function_name": "calculation_genotype_concordance",
 6877                    "function_params": [],
 6878                },
 6879                "BARCODE": {
 6880                    "type": "python",
 6881                    "name": "BARCODE",
 6882                    "description": "BARCODE as VaRank tool",
 6883                    "available": True,
 6884                    "function_name": "calculation_barcode",
 6885                    "function_params": [],
 6886                },
 6887                "BARCODEFAMILY": {
 6888                    "type": "python",
 6889                    "name": "BARCODEFAMILY",
 6890                    "description": "BARCODEFAMILY as VaRank tool",
 6891                    "available": True,
 6892                    "function_name": "calculation_barcode_family",
 6893                    "function_params": ["BCF"],
 6894                },
 6895                "TRIO": {
 6896                    "type": "python",
 6897                    "name": "TRIO",
 6898                    "description": "Inheritance for a trio family",
 6899                    "available": True,
 6900                    "function_name": "calculation_trio",
 6901                    "function_params": [],
 6902                },
 6903                "VAF": {
 6904                    "type": "python",
 6905                    "name": "VAF",
 6906                    "description": "Variant Allele Frequency (VAF) harmonization",
 6907                    "available": True,
 6908                    "function_name": "calculation_vaf_normalization",
 6909                    "function_params": [],
 6910                },
 6911                "VAF_stats": {
 6912                    "type": "python",
 6913                    "name": "VAF_stats",
 6914                    "description": "Variant Allele Frequency (VAF) statistics",
 6915                    "available": True,
 6916                    "function_name": "calculation_genotype_stats",
 6917                    "function_params": ["VAF"],
 6918                },
 6919                "DP_stats": {
 6920                    "type": "python",
 6921                    "name": "DP_stats",
 6922                    "description": "Depth (DP) statistics",
 6923                    "available": True,
 6924                    "function_name": "calculation_genotype_stats",
 6925                    "function_params": ["DP"],
 6926                },
 6927                "variant_id": {
 6928                    "type": "python",
 6929                    "name": "variant_id",
 6930                    "description": "Variant ID generated from variant position and type",
 6931                    "available": True,
 6932                    "function_name": "calculation_variant_id",
 6933                    "function_params": [],
 6934                },
 6935                "transcripts_json": {
 6936                    "type": "python",
 6937                    "name": "transcripts_json",
 6938                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6939                    "available": True,
 6940                    "function_name": "calculation_transcripts_annotation",
 6941                    "function_params": ["transcripts_json", None],
 6942                },
 6943                "transcripts_ann": {
 6944                    "type": "python",
 6945                    "name": "transcripts_ann",
 6946                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6947                    "available": True,
 6948                    "function_name": "calculation_transcripts_annotation",
 6949                    "function_params": [None, "transcripts_ann"],
 6950                },
 6951                "transcripts_annotations": {
 6952                    "type": "python",
 6953                    "name": "transcripts_annotations",
 6954                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6955                    "available": True,
 6956                    "function_name": "calculation_transcripts_annotation",
 6957                    "function_params": [None, None],
 6958                },
 6959                "transcripts_prioritization": {
 6960                    "type": "python",
 6961                    "name": "transcripts_prioritization",
 6962                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6963                    "available": True,
 6964                    "function_name": "calculation_transcripts_prioritization",
 6965                    "function_params": [],
 6966                },
 6967                "transcripts_export": {
 6968                    "type": "python",
 6969                    "name": "transcripts_export",
 6970                    "description": "Export transcripts table/view as a file (using param.json)",
 6971                    "available": True,
 6972                    "function_name": "calculation_transcripts_export",
 6973                    "function_params": [],
 6974                },
 6975            },
 6976            "prioritizations": {
 6977                "default": {
 6978                    "ANN2": [
 6979                        {
 6980                            "type": "contains",
 6981                            "value": "HIGH",
 6982                            "score": 5,
 6983                            "flag": "PASS",
 6984                            "comment": [
 6985                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6986                            ],
 6987                        },
 6988                        {
 6989                            "type": "contains",
 6990                            "value": "MODERATE",
 6991                            "score": 3,
 6992                            "flag": "PASS",
 6993                            "comment": [
 6994                                "A non-disruptive variant that might change protein effectiveness"
 6995                            ],
 6996                        },
 6997                        {
 6998                            "type": "contains",
 6999                            "value": "LOW",
 7000                            "score": 0,
 7001                            "flag": "FILTERED",
 7002                            "comment": [
 7003                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 7004                            ],
 7005                        },
 7006                        {
 7007                            "type": "contains",
 7008                            "value": "MODIFIER",
 7009                            "score": 0,
 7010                            "flag": "FILTERED",
 7011                            "comment": [
 7012                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 7013                            ],
 7014                        },
 7015                    ],
 7016                }
 7017            },
 7018        }
 7019
 7020        return config_default.get(name, None)
 7021
 7022    def get_config_json(
 7023        self, name: str, config_dict: dict = {}, config_file: str = None
 7024    ) -> dict:
 7025        """
 7026        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7027        default values, a dictionary, and a file.
 7028
 7029        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7030        the name of the configuration. It is used to identify and retrieve the configuration settings
 7031        for a specific component or module
 7032        :type name: str
 7033        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7034        dictionary that allows you to provide additional configuration settings or overrides. When you
 7035        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7036        the key is the configuration setting you want to override or
 7037        :type config_dict: dict
 7038        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7039        specify the path to a configuration file that contains additional settings. If provided, the
 7040        function will read the contents of this file and update the configuration dictionary with the
 7041        values found in the file, overriding any existing values with the
 7042        :type config_file: str
 7043        :return: The function `get_config_json` returns a dictionary containing the configuration
 7044        settings.
 7045        """
 7046
 7047        # Create with default prioritizations
 7048        config_default = self.get_config_default(name=name)
 7049        configuration = config_default
 7050        # log.debug(f"configuration={configuration}")
 7051
 7052        # Replace prioritizations from dict
 7053        for config in config_dict:
 7054            configuration[config] = config_dict[config]
 7055
 7056        # Replace prioritizations from file
 7057        config_file = full_path(config_file)
 7058        if config_file:
 7059            if os.path.exists(config_file):
 7060                with open(config_file) as config_file_content:
 7061                    config_file_dict = yaml.safe_load(config_file_content)
 7062                for config in config_file_dict:
 7063                    configuration[config] = config_file_dict[config]
 7064            else:
 7065                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7066                log.error(msg_error)
 7067                raise ValueError(msg_error)
 7068
 7069        return configuration
 7070
 7071    def prioritization(
 7072        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7073    ) -> bool:
 7074        """
 7075        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7076        prioritizes variants based on configured profiles and criteria.
 7077
 7078        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7079        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7080        a table name is provided, the method will prioritize the variants in that specific table
 7081        :type table: str
 7082        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7083        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7084        provided, the code will use a default prefix value of "PZ"
 7085        :type pz_prefix: str
 7086        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7087        additional parameters specific to the prioritization process. These parameters can include
 7088        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7089        configurations needed for the prioritization of variants in a V
 7090        :type pz_param: dict
 7091        :return: A boolean value (True) is being returned from the `prioritization` function.
 7092        """
 7093
 7094        # Config
 7095        config = self.get_config()
 7096
 7097        # Param
 7098        param = self.get_param()
 7099
 7100        # Prioritization param
 7101        if pz_param is not None:
 7102            prioritization_param = pz_param
 7103        else:
 7104            prioritization_param = param.get("prioritization", {})
 7105
 7106        # Configuration profiles
 7107        prioritization_config_file = prioritization_param.get(
 7108            "prioritization_config", None
 7109        )
 7110        prioritization_config_file = full_path(prioritization_config_file)
 7111        prioritizations_config = self.get_config_json(
 7112            name="prioritizations", config_file=prioritization_config_file
 7113        )
 7114
 7115        # Prioritization prefix
 7116        pz_prefix_default = "PZ"
 7117        if pz_prefix is None:
 7118            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7119
 7120        # Prioritization options
 7121        profiles = prioritization_param.get("profiles", [])
 7122        if isinstance(profiles, str):
 7123            profiles = profiles.split(",")
 7124        pzfields = prioritization_param.get(
 7125            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7126        )
 7127        if isinstance(pzfields, str):
 7128            pzfields = pzfields.split(",")
 7129        default_profile = prioritization_param.get("default_profile", None)
 7130        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7131        prioritization_score_mode = prioritization_param.get(
 7132            "prioritization_score_mode", "HOWARD"
 7133        )
 7134
 7135        # Quick Prioritizations
 7136        prioritizations = param.get("prioritizations", None)
 7137        if prioritizations:
 7138            log.info("Quick Prioritization:")
 7139            for profile in prioritizations.split(","):
 7140                if profile not in profiles:
 7141                    profiles.append(profile)
 7142                    log.info(f"   {profile}")
 7143
 7144        # If profile "ALL" provided, all profiles in the config profiles
 7145        if "ALL" in profiles:
 7146            profiles = list(prioritizations_config.keys())
 7147
 7148        for profile in profiles:
 7149            if prioritizations_config.get(profile, None):
 7150                log.debug(f"Profile '{profile}' configured")
 7151            else:
 7152                msg_error = f"Profile '{profile}' NOT configured"
 7153                log.error(msg_error)
 7154                raise ValueError(msg_error)
 7155
 7156        if profiles:
 7157            log.info(f"Prioritization... ")
 7158        else:
 7159            log.debug(f"No profile defined")
 7160            return False
 7161
 7162        if not default_profile and len(profiles):
 7163            default_profile = profiles[0]
 7164
 7165        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7166        log.debug("Profiles to check: " + str(list(profiles)))
 7167
 7168        # Variables
 7169        if table is not None:
 7170            table_variants = table
 7171        else:
 7172            table_variants = self.get_table_variants(clause="update")
 7173        log.debug(f"Table to prioritize: {table_variants}")
 7174
 7175        # Added columns
 7176        added_columns = []
 7177
 7178        # Create list of PZfields
 7179        # List of PZFields
 7180        list_of_pzfields_original = pzfields + [
 7181            pzfield + pzfields_sep + profile
 7182            for pzfield in pzfields
 7183            for profile in profiles
 7184        ]
 7185        list_of_pzfields = []
 7186        log.debug(f"{list_of_pzfields_original}")
 7187
 7188        # Remove existing PZfields to use if exists
 7189        for pzfield in list_of_pzfields_original:
 7190            if self.get_header().infos.get(pzfield, None) is None:
 7191                list_of_pzfields.append(pzfield)
 7192                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7193            else:
 7194                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7195
 7196        if list_of_pzfields:
 7197
 7198            # Explode Infos prefix
 7199            explode_infos_prefix = self.get_explode_infos_prefix()
 7200
 7201            # PZfields tags description
 7202            PZfields_INFOS = {
 7203                f"{pz_prefix}Tags": {
 7204                    "ID": f"{pz_prefix}Tags",
 7205                    "Number": ".",
 7206                    "Type": "String",
 7207                    "Description": "Variant tags based on annotation criteria",
 7208                },
 7209                f"{pz_prefix}Score": {
 7210                    "ID": f"{pz_prefix}Score",
 7211                    "Number": 1,
 7212                    "Type": "Integer",
 7213                    "Description": "Variant score based on annotation criteria",
 7214                },
 7215                f"{pz_prefix}Flag": {
 7216                    "ID": f"{pz_prefix}Flag",
 7217                    "Number": 1,
 7218                    "Type": "String",
 7219                    "Description": "Variant flag based on annotation criteria",
 7220                },
 7221                f"{pz_prefix}Comment": {
 7222                    "ID": f"{pz_prefix}Comment",
 7223                    "Number": ".",
 7224                    "Type": "String",
 7225                    "Description": "Variant comment based on annotation criteria",
 7226                },
 7227                f"{pz_prefix}Infos": {
 7228                    "ID": f"{pz_prefix}Infos",
 7229                    "Number": ".",
 7230                    "Type": "String",
 7231                    "Description": "Variant infos based on annotation criteria",
 7232                },
 7233                f"{pz_prefix}Class": {
 7234                    "ID": f"{pz_prefix}Class",
 7235                    "Number": ".",
 7236                    "Type": "String",
 7237                    "Description": "Variant class based on annotation criteria",
 7238                },
 7239            }
 7240
 7241            # Create INFO fields if not exist
 7242            for field in PZfields_INFOS:
 7243                field_ID = PZfields_INFOS[field]["ID"]
 7244                field_description = PZfields_INFOS[field]["Description"]
 7245                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7246                    field_description = (
 7247                        PZfields_INFOS[field]["Description"]
 7248                        + f", profile {default_profile}"
 7249                    )
 7250                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7251                        field_ID,
 7252                        PZfields_INFOS[field]["Number"],
 7253                        PZfields_INFOS[field]["Type"],
 7254                        field_description,
 7255                        "unknown",
 7256                        "unknown",
 7257                        code_type_map[PZfields_INFOS[field]["Type"]],
 7258                    )
 7259
 7260            # Create INFO fields if not exist for each profile
 7261            for profile in prioritizations_config:
 7262                if profile in profiles or profiles == []:
 7263                    for field in PZfields_INFOS:
 7264                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7265                        field_description = (
 7266                            PZfields_INFOS[field]["Description"]
 7267                            + f", profile {profile}"
 7268                        )
 7269                        if (
 7270                            field_ID not in self.get_header().infos
 7271                            and field in pzfields
 7272                        ):
 7273                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7274                                field_ID,
 7275                                PZfields_INFOS[field]["Number"],
 7276                                PZfields_INFOS[field]["Type"],
 7277                                field_description,
 7278                                "unknown",
 7279                                "unknown",
 7280                                code_type_map[PZfields_INFOS[field]["Type"]],
 7281                            )
 7282
 7283            # Header
 7284            for pzfield in list_of_pzfields:
 7285                if re.match(f"{pz_prefix}Score.*", pzfield):
 7286                    added_column = self.add_column(
 7287                        table_name=table_variants,
 7288                        column_name=pzfield,
 7289                        column_type="INTEGER",
 7290                        default_value="0",
 7291                    )
 7292                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7293                    added_column = self.add_column(
 7294                        table_name=table_variants,
 7295                        column_name=pzfield,
 7296                        column_type="BOOLEAN",
 7297                        default_value="1",
 7298                    )
 7299                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7300                    added_column = self.add_column(
 7301                        table_name=table_variants,
 7302                        column_name=pzfield,
 7303                        column_type="VARCHAR[]",
 7304                        default_value="null",
 7305                    )
 7306                else:
 7307                    added_column = self.add_column(
 7308                        table_name=table_variants,
 7309                        column_name=pzfield,
 7310                        column_type="STRING",
 7311                        default_value="''",
 7312                    )
 7313                added_columns.append(added_column)
 7314
 7315            # Profiles
 7316            if profiles:
 7317
 7318                # foreach profile in configuration file
 7319                for profile in prioritizations_config:
 7320
 7321                    # If profile is asked in param, or ALL are asked (empty profile [])
 7322                    if profile in profiles or profiles == []:
 7323                        log.info(f"Profile '{profile}'")
 7324
 7325                        sql_set_info_option = ""
 7326
 7327                        sql_set_info = []
 7328
 7329                        # PZ fields set
 7330
 7331                        # PZScore
 7332                        if (
 7333                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7334                            in list_of_pzfields
 7335                        ):
 7336                            sql_set_info.append(
 7337                                f"""
 7338                                    concat(
 7339                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7340                                        {pz_prefix}Score{pzfields_sep}{profile}
 7341                                    ) 
 7342                                """
 7343                            )
 7344                            if (
 7345                                profile == default_profile
 7346                                and f"{pz_prefix}Score" in list_of_pzfields
 7347                            ):
 7348                                sql_set_info.append(
 7349                                    f"""
 7350                                        concat(
 7351                                            '{pz_prefix}Score=',
 7352                                            {pz_prefix}Score{pzfields_sep}{profile}
 7353                                        )
 7354                                    """
 7355                                )
 7356
 7357                        # PZFlag
 7358                        if (
 7359                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7360                            in list_of_pzfields
 7361                        ):
 7362                            sql_set_info.append(
 7363                                f"""
 7364                                    concat(
 7365                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7366                                        CASE 
 7367                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7368                                            THEN 'PASS'
 7369                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7370                                            THEN 'FILTERED'
 7371                                        END
 7372                                    ) 
 7373                                """
 7374                            )
 7375                            if (
 7376                                profile == default_profile
 7377                                and f"{pz_prefix}Flag" in list_of_pzfields
 7378                            ):
 7379                                sql_set_info.append(
 7380                                    f"""
 7381                                        concat(
 7382                                            '{pz_prefix}Flag=',
 7383                                            CASE 
 7384                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7385                                                THEN 'PASS'
 7386                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7387                                                THEN 'FILTERED'
 7388                                            END
 7389                                        )
 7390                                    """
 7391                                )
 7392
 7393                        # PZClass
 7394                        if (
 7395                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7396                            in list_of_pzfields
 7397                        ):
 7398                            sql_set_info.append(
 7399                                f"""
 7400                                    concat(
 7401                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7402                                        CASE
 7403                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7404                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7405                                            ELSE '.'
 7406                                        END 
 7407                                    )
 7408                                    
 7409                                """
 7410                            )
 7411                            if (
 7412                                profile == default_profile
 7413                                and f"{pz_prefix}Class" in list_of_pzfields
 7414                            ):
 7415                                sql_set_info.append(
 7416                                    f"""
 7417                                        concat(
 7418                                            '{pz_prefix}Class=',
 7419                                            CASE
 7420                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7421                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7422                                                ELSE '.'
 7423                                            END 
 7424                                        )
 7425                                    """
 7426                                )
 7427
 7428                        # PZComment
 7429                        if (
 7430                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7431                            in list_of_pzfields
 7432                        ):
 7433                            sql_set_info.append(
 7434                                f"""
 7435                                    CASE
 7436                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7437                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7438                                        ELSE ''
 7439                                    END
 7440                                """
 7441                            )
 7442                            if (
 7443                                profile == default_profile
 7444                                and f"{pz_prefix}Comment" in list_of_pzfields
 7445                            ):
 7446                                sql_set_info.append(
 7447                                    f"""
 7448                                        CASE
 7449                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7450                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7451                                            ELSE ''
 7452                                        END
 7453                                    """
 7454                                )
 7455
 7456                        # PZInfos
 7457                        if (
 7458                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7459                            in list_of_pzfields
 7460                        ):
 7461                            sql_set_info.append(
 7462                                f"""
 7463                                    CASE
 7464                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7465                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7466                                        ELSE ''
 7467                                    END
 7468                                """
 7469                            )
 7470                            if (
 7471                                profile == default_profile
 7472                                and f"{pz_prefix}Infos" in list_of_pzfields
 7473                            ):
 7474                                sql_set_info.append(
 7475                                    f"""
 7476                                        CASE
 7477                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7478                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7479                                            ELSE ''
 7480                                        END
 7481                                    """
 7482                                )
 7483
 7484                        # Merge PZfields
 7485                        sql_set_info_option = ""
 7486                        sql_set_sep = ""
 7487                        for sql_set in sql_set_info:
 7488                            if sql_set_sep:
 7489                                sql_set_info_option += f"""
 7490                                    , concat('{sql_set_sep}', {sql_set})
 7491                                """
 7492                            else:
 7493                                sql_set_info_option += f"""
 7494                                    , {sql_set}
 7495                                """
 7496                            sql_set_sep = ";"
 7497
 7498                        sql_queries = []
 7499                        for annotation in prioritizations_config[profile]:
 7500
 7501                            # skip special sections
 7502                            if annotation.startswith("_"):
 7503                                continue
 7504
 7505                            # For each criterions
 7506                            for criterion in prioritizations_config[profile][
 7507                                annotation
 7508                            ]:
 7509
 7510                                # Criterion mode
 7511                                criterion_mode = None
 7512                                if np.any(
 7513                                    np.isin(list(criterion.keys()), ["type", "value"])
 7514                                ):
 7515                                    criterion_mode = "operation"
 7516                                elif np.any(
 7517                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7518                                ):
 7519                                    criterion_mode = "sql"
 7520                                log.debug(f"Criterion Mode: {criterion_mode}")
 7521
 7522                                # Criterion parameters
 7523                                criterion_type = criterion.get("type", None)
 7524                                criterion_value = criterion.get("value", None)
 7525                                criterion_sql = criterion.get("sql", None)
 7526                                criterion_fields = criterion.get("fields", None)
 7527                                criterion_score = criterion.get("score", 0)
 7528                                criterion_flag = criterion.get("flag", "PASS")
 7529                                criterion_class = criterion.get("class", None)
 7530                                criterion_flag_bool = criterion_flag == "PASS"
 7531                                criterion_comment = (
 7532                                    ", ".join(criterion.get("comment", []))
 7533                                    .replace("'", "''")
 7534                                    .replace(";", ",")
 7535                                    .replace("\t", " ")
 7536                                )
 7537                                criterion_infos = (
 7538                                    str(criterion)
 7539                                    .replace("'", "''")
 7540                                    .replace(";", ",")
 7541                                    .replace("\t", " ")
 7542                                )
 7543
 7544                                # SQL
 7545                                if criterion_sql is not None and isinstance(
 7546                                    criterion_sql, list
 7547                                ):
 7548                                    criterion_sql = " ".join(criterion_sql)
 7549
 7550                                # Fields and explode
 7551                                if criterion_fields is None:
 7552                                    criterion_fields = [annotation]
 7553                                if not isinstance(criterion_fields, list):
 7554                                    criterion_fields = str(criterion_fields).split(",")
 7555
 7556                                # Class
 7557                                if criterion_class is not None and not isinstance(
 7558                                    criterion_class, list
 7559                                ):
 7560                                    criterion_class = str(criterion_class).split(",")
 7561
 7562                                for annotation_field in criterion_fields:
 7563
 7564                                    # Explode specific annotation
 7565                                    log.debug(
 7566                                        f"Explode annotation '{annotation_field}'"
 7567                                    )
 7568                                    added_columns += self.explode_infos(
 7569                                        prefix=explode_infos_prefix,
 7570                                        fields=[annotation_field],
 7571                                        table=table_variants,
 7572                                    )
 7573                                    extra_infos = self.get_extra_infos(
 7574                                        table=table_variants
 7575                                    )
 7576
 7577                                    # Check if annotation field is present
 7578                                    if (
 7579                                        f"{explode_infos_prefix}{annotation_field}"
 7580                                        not in extra_infos
 7581                                    ):
 7582                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7583                                        log.error(msq_err)
 7584                                        raise ValueError(msq_err)
 7585                                    else:
 7586                                        log.debug(
 7587                                            f"Annotation '{annotation_field}' in data"
 7588                                        )
 7589
 7590                                sql_set = []
 7591                                sql_set_info = []
 7592
 7593                                # PZ fields set
 7594
 7595                                # PZScore
 7596                                if (
 7597                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7598                                    in list_of_pzfields
 7599                                ):
 7600                                    # VaRank prioritization score mode
 7601                                    if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
 7602                                        sql_set.append(
 7603                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7604                                        )
 7605                                    # default HOWARD prioritization score mode
 7606                                    else:
 7607                                        sql_set.append(
 7608                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7609                                        )
 7610
 7611                                # PZFlag
 7612                                if (
 7613                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7614                                    in list_of_pzfields
 7615                                ):
 7616                                    sql_set.append(
 7617                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7618                                    )
 7619
 7620                                # PZClass
 7621                                if (
 7622                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7623                                    in list_of_pzfields
 7624                                    and criterion_class is not None
 7625                                ):
 7626                                    sql_set.append(
 7627                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7628                                    )
 7629
 7630                                # PZComment
 7631                                if (
 7632                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7633                                    in list_of_pzfields
 7634                                ):
 7635                                    sql_set.append(
 7636                                        f"""
 7637                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7638                                                concat(
 7639                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7640                                                    CASE 
 7641                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7642                                                        THEN ', '
 7643                                                        ELSE ''
 7644                                                    END,
 7645                                                    '{criterion_comment}'
 7646                                                )
 7647                                        """
 7648                                    )
 7649
 7650                                # PZInfos
 7651                                if (
 7652                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7653                                    in list_of_pzfields
 7654                                ):
 7655                                    sql_set.append(
 7656                                        f"""
 7657                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7658                                                concat(
 7659                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7660                                                    '{criterion_infos}'
 7661                                                )
 7662                                        """
 7663                                    )
 7664                                sql_set_option = ",".join(sql_set)
 7665
 7666                                # Criterion and comparison
 7667                                if sql_set_option:
 7668
 7669                                    if criterion_mode in ["operation"]:
 7670
 7671                                        try:
 7672                                            float(criterion_value)
 7673                                            sql_update = f"""
 7674                                                UPDATE {table_variants}
 7675                                                SET {sql_set_option}
 7676                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7677                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7678                                            """
 7679                                        except:
 7680                                            contains_option = ""
 7681                                            if criterion_type == "contains":
 7682                                                contains_option = ".*"
 7683                                            sql_update = f"""
 7684                                                UPDATE {table_variants}
 7685                                                SET {sql_set_option}
 7686                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7687                                            """
 7688                                        sql_queries.append(sql_update)
 7689
 7690                                    elif criterion_mode in ["sql"]:
 7691
 7692                                        sql_update = f"""
 7693                                            UPDATE {table_variants}
 7694                                            SET {sql_set_option}
 7695                                            WHERE {criterion_sql}
 7696                                        """
 7697                                        sql_queries.append(sql_update)
 7698
 7699                                    else:
 7700                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7701                                        log.error(msg_err)
 7702                                        raise ValueError(msg_err)
 7703
 7704                                else:
 7705                                    log.warning(
 7706                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7707                                    )
 7708
 7709                        # PZTags
 7710                        if (
 7711                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7712                            in list_of_pzfields
 7713                        ):
 7714
                            # Create PZTags value
 7716                            pztags_value = ""
 7717                            pztags_sep_default = ","
 7718                            pztags_sep = ""
 7719                            for pzfield in pzfields:
 7720                                if pzfield not in [f"{pz_prefix}Tags"]:
 7721                                    if (
 7722                                        f"{pzfield}{pzfields_sep}{profile}"
 7723                                        in list_of_pzfields
 7724                                    ):
 7725                                        if pzfield in [f"{pz_prefix}Flag"]:
 7726                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7727                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7728                                                    THEN 'PASS'
 7729                                                    ELSE 'FILTERED'
 7730                                                END, '"""
 7731                                        elif pzfield in [f"{pz_prefix}Class"]:
 7732                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7733                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7734                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7735                                                    ELSE '.'
 7736                                                END, '"""
 7737                                        else:
 7738                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7739                                        pztags_sep = pztags_sep_default
 7740
                            # Add Query update for PZTags
 7742                            sql_update_pztags = f"""
 7743                                UPDATE {table_variants}
 7744                                SET INFO = concat(
 7745                                        INFO,
 7746                                        CASE WHEN INFO NOT in ('','.')
 7747                                                THEN ';'
 7748                                                ELSE ''
 7749                                        END,
 7750                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7751                                    )
 7752                                """
 7753                            sql_queries.append(sql_update_pztags)
 7754
                            # Add Query update for PZTags for default profile
 7756                            if profile == default_profile:
 7757                                sql_update_pztags_default = f"""
 7758                                UPDATE {table_variants}
 7759                                SET INFO = concat(
 7760                                        INFO,
 7761                                        ';',
 7762                                        '{pz_prefix}Tags={pztags_value}'
 7763                                    )
 7764                                """
 7765                                sql_queries.append(sql_update_pztags_default)
 7766
 7767                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7768
 7769                        if sql_queries:
 7770
 7771                            for sql_query in sql_queries:
 7772                                log.debug(
 7773                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7774                                )
 7775                                self.conn.execute(sql_query)
 7776
 7777                        log.info(f"""Profile '{profile}' - Update... """)
 7778                        sql_query_update = f"""
 7779                            UPDATE {table_variants}
 7780                            SET INFO =  
 7781                                concat(
 7782                                    CASE
 7783                                        WHEN INFO NOT IN ('','.')
 7784                                        THEN concat(INFO, ';')
 7785                                        ELSE ''
 7786                                    END
 7787                                    {sql_set_info_option}
 7788                                )
 7789                        """
 7790                        self.conn.execute(sql_query_update)
 7791
 7792        else:
 7793
 7794            log.warning(f"No profiles in parameters")
 7795
 7796        # Remove added columns
 7797        for added_column in added_columns:
 7798            self.drop_column(column=added_column)
 7799
 7800        # Explode INFOS fields into table fields
 7801        if self.get_explode_infos():
 7802            self.explode_infos(
 7803                prefix=self.get_explode_infos_prefix(),
 7804                fields=self.get_explode_infos_fields(),
 7805                force=True,
 7806            )
 7807
 7808        return True
 7809
 7810    ###
 7811    # HGVS
 7812    ###
 7813
 7814    def annotation_hgvs(self, threads: int = None) -> None:
 7815        """
 7816        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7817        coordinates and alleles.
 7818
 7819        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7820        threads to use for parallel processing. If no value is provided, it will default to the number
 7821        of threads obtained from the `get_threads()` method
 7822        :type threads: int
 7823        """
 7824
 7825        # Function for each partition of the Dask Dataframe
 7826        def partition_function(partition):
 7827            """
 7828            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7829            each row of a DataFrame called `partition`.
 7830
 7831            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7832            to be processed
 7833            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7834            the "partition" dataframe along the axis 1.
 7835            """
 7836            return partition.apply(annotation_hgvs_partition, axis=1)
 7837
        def annotation_hgvs_partition(row) -> str:
            """
            Build the comma-separated HGVS annotation string for one variant.

            For the variant described by ``row`` (keys "CHROM", "POS", "REF",
            "ALT"), all transcripts overlapping the position are looked up in
            the registered ``refseq_df`` frame through the enclosing scope's
            ``polars_conn``; one HGVS name is then formatted per transcript
            with ``format_hgvs_name``, driven by the closure flags
            ``use_exon``, ``use_gene``, ``use_protein``, ``add_protein``,
            ``full_format``, ``use_version`` and ``codon_type``.

            :param row: dictionary-like record exposing the keys "CHROM",
                "POS", "REF" and "ALT" (presumably a pandas row produced by
                ``DataFrame.apply(..., axis=1)`` — confirm with the caller)
            :return: the HGVS names for this variant, joined with ','
                (empty string when no transcript overlaps the position)
            """

            # Variant coordinates and alleles.
            # NOTE(review): 'chr' shadows the builtin; kept as-is here.
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts.
            # NOTE(review): CHROM/POS are interpolated directly into the SQL
            # text — assumes upstream filtering keeps them well-formed.
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript object resolved from the preloaded transcripts
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number, only when exon annotation is requested
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession: fetched from refseqlink_df only when a
                # protein-level notation is needed for the output
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name for this transcript
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When add_protein is set (and the protein form was not
                # already produced above), emit a second, protein-level HGVS
                # name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Join all HGVS names into a single comma-separated string
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full
 7933
 7934        # Polars connexion
 7935        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7936
 7937        # Config
 7938        config = self.get_config()
 7939
 7940        # Databases
 7941        # Genome
 7942        databases_genomes_folders = (
 7943            config.get("folders", {})
 7944            .get("databases", {})
 7945            .get("genomes", DEFAULT_GENOME_FOLDER)
 7946        )
 7947        databases_genome = (
 7948            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7949        )
 7950        # refseq database folder
 7951        databases_refseq_folders = (
 7952            config.get("folders", {})
 7953            .get("databases", {})
 7954            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7955        )
 7956        # refseq
 7957        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7958        # refSeqLink
 7959        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7960
 7961        # Param
 7962        param = self.get_param()
 7963
 7964        # Quick HGVS
 7965        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7966            log.info(f"Quick HGVS Annotation:")
 7967            if not param.get("hgvs", None):
 7968                param["hgvs"] = {}
 7969            for option in param.get("hgvs_options", "").split(","):
 7970                option_var_val = option.split("=")
 7971                option_var = option_var_val[0]
 7972                if len(option_var_val) > 1:
 7973                    option_val = option_var_val[1]
 7974                else:
 7975                    option_val = "True"
 7976                if option_val.upper() in ["TRUE"]:
 7977                    option_val = True
 7978                elif option_val.upper() in ["FALSE"]:
 7979                    option_val = False
 7980                log.info(f"   {option_var}={option_val}")
 7981                param["hgvs"][option_var] = option_val
 7982
 7983        # Check if HGVS annotation enabled
 7984        if "hgvs" in param:
 7985            log.info(f"HGVS Annotation... ")
 7986            for hgvs_option in param.get("hgvs", {}):
 7987                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7988        else:
 7989            return
 7990
 7991        # HGVS Param
 7992        param_hgvs = param.get("hgvs", {})
 7993        use_exon = param_hgvs.get("use_exon", False)
 7994        use_gene = param_hgvs.get("use_gene", False)
 7995        use_protein = param_hgvs.get("use_protein", False)
 7996        add_protein = param_hgvs.get("add_protein", False)
 7997        full_format = param_hgvs.get("full_format", False)
 7998        use_version = param_hgvs.get("use_version", False)
 7999        codon_type = param_hgvs.get("codon_type", "3")
 8000
        # refSeq refSeqLink
 8002        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 8003        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 8004
 8005        # Assembly
 8006        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 8007
 8008        # Genome
 8009        genome_file = None
 8010        if find_genome(databases_genome):
 8011            genome_file = find_genome(databases_genome)
 8012        else:
 8013            genome_file = find_genome(
 8014                genome_path=databases_genomes_folders, assembly=assembly
 8015            )
 8016        log.debug("Genome: " + str(genome_file))
 8017
        # refSeq
 8019        refseq_file = find_file_prefix(
 8020            input_file=databases_refseq,
 8021            prefix="ncbiRefSeq",
 8022            folder=databases_refseq_folders,
 8023            assembly=assembly,
 8024        )
 8025        log.debug("refSeq: " + str(refseq_file))
 8026
 8027        # refSeqLink
 8028        refseqlink_file = find_file_prefix(
 8029            input_file=databases_refseqlink,
 8030            prefix="ncbiRefSeqLink",
 8031            folder=databases_refseq_folders,
 8032            assembly=assembly,
 8033        )
 8034        log.debug("refSeqLink: " + str(refseqlink_file))
 8035
 8036        # Threads
 8037        if not threads:
 8038            threads = self.get_threads()
 8039        log.debug("Threads: " + str(threads))
 8040
 8041        # Variables
 8042        table_variants = self.get_table_variants(clause="update")
 8043
 8044        # Get variants SNV and InDel only
 8045        query_variants = f"""
 8046            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 8047            FROM {table_variants}
 8048            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 8049            """
 8050        df_variants = self.get_query_to_df(query_variants)
 8051
 8052        # Added columns
 8053        added_columns = []
 8054
 8055        # Add hgvs column in variants table
 8056        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 8057        added_column = self.add_column(
 8058            table_variants, hgvs_column_name, "STRING", default_value=None
 8059        )
 8060        added_columns.append(added_column)
 8061
 8062        log.debug(f"refSeq loading...")
 8063        # refSeq in duckDB
 8064        refseq_table = get_refseq_table(
 8065            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 8066        )
 8067        # Loading all refSeq in Dataframe
 8068        refseq_query = f"""
 8069            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8070            FROM {refseq_table}
 8071            JOIN df_variants ON (
 8072                {refseq_table}.chrom = df_variants.CHROM
 8073                AND {refseq_table}.txStart<=df_variants.POS
 8074                AND {refseq_table}.txEnd>=df_variants.POS
 8075            )
 8076        """
 8077        refseq_df = self.conn.query(refseq_query).pl()
 8078
 8079        if refseqlink_file:
 8080            log.debug(f"refSeqLink loading...")
 8081            # refSeqLink in duckDB
 8082            refseqlink_table = get_refseq_table(
 8083                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8084            )
 8085            # Loading all refSeqLink in Dataframe
 8086            protacc_column = "protAcc_with_ver"
 8087            mrnaacc_column = "mrnaAcc_with_ver"
 8088            refseqlink_query = f"""
 8089                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8090                FROM {refseqlink_table} 
 8091                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8092                WHERE protAcc_without_ver IS NOT NULL
 8093            """
 8094            # Polars Dataframe
 8095            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8096
 8097        # Read RefSeq transcripts into a python dict/model.
 8098        log.debug(f"Transcripts loading...")
 8099        with tempfile.TemporaryDirectory() as tmpdir:
 8100            transcripts_query = f"""
 8101                COPY (
 8102                    SELECT {refseq_table}.*
 8103                    FROM {refseq_table}
 8104                    JOIN df_variants ON (
 8105                        {refseq_table}.chrom=df_variants.CHROM
 8106                        AND {refseq_table}.txStart<=df_variants.POS
 8107                        AND {refseq_table}.txEnd>=df_variants.POS
 8108                    )
 8109                )
 8110                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8111            """
 8112            self.conn.query(transcripts_query)
 8113            with open(f"{tmpdir}/transcript.tsv") as infile:
 8114                transcripts = read_transcripts(infile)
 8115
 8116        # Polars connexion
 8117        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8118
 8119        log.debug("Genome loading...")
 8120        # Read genome sequence using pyfaidx.
 8121        genome = Fasta(genome_file)
 8122
 8123        log.debug("Start annotation HGVS...")
 8124
 8125        # Create
 8126        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8127        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8128
 8129        # Use dask.dataframe.apply() to apply function on each partition
 8130        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8131
 8132        # Convert Dask DataFrame to Pandas Dataframe
 8133        df = ddf.compute()
 8134
 8135        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8136        with tempfile.TemporaryDirectory() as tmpdir:
 8137            df_parquet = os.path.join(tmpdir, "df.parquet")
 8138            df.to_parquet(df_parquet)
 8139
 8140            # Update hgvs column
 8141            update_variant_query = f"""
 8142                UPDATE {table_variants}
 8143                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8144                FROM read_parquet('{df_parquet}') as df
 8145                WHERE variants."#CHROM" = df.CHROM
 8146                AND variants.POS = df.POS
 8147                AND variants.REF = df.REF
 8148                AND variants.ALT = df.ALT
 8149                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8150                """
 8151            self.execute_query(update_variant_query)
 8152
 8153        # Update INFO column
 8154        sql_query_update = f"""
 8155            UPDATE {table_variants}
 8156            SET INFO = 
 8157                concat(
 8158                    CASE 
 8159                        WHEN INFO NOT IN ('','.')
 8160                        THEN concat(INFO, ';')
 8161                        ELSE ''
 8162                    END,
 8163                    'hgvs=',
 8164                    {hgvs_column_name}
 8165                )
 8166            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8167            """
 8168        self.execute_query(sql_query_update)
 8169
 8170        # Add header
 8171        HGVS_INFOS = {
 8172            "hgvs": {
 8173                "ID": "hgvs",
 8174                "Number": ".",
 8175                "Type": "String",
 8176                "Description": f"HGVS annotatation with HOWARD",
 8177            }
 8178        }
 8179
 8180        for field in HGVS_INFOS:
 8181            field_ID = HGVS_INFOS[field]["ID"]
 8182            field_description = HGVS_INFOS[field]["Description"]
 8183            self.get_header().infos[field_ID] = vcf.parser._Info(
 8184                field_ID,
 8185                HGVS_INFOS[field]["Number"],
 8186                HGVS_INFOS[field]["Type"],
 8187                field_description,
 8188                "unknown",
 8189                "unknown",
 8190                code_type_map[HGVS_INFOS[field]["Type"]],
 8191            )
 8192
 8193        # Remove added columns
 8194        for added_column in added_columns:
 8195            self.drop_column(column=added_column)
 8196
 8197    ###
 8198    # Calculation
 8199    ###
 8200
 8201    def get_operations_help(
 8202        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8203    ) -> list:
 8204
 8205        # Init
 8206        operations_help = []
 8207
 8208        # operations
 8209        operations = self.get_config_json(
 8210            name="calculations",
 8211            config_dict=operations_config_dict,
 8212            config_file=operations_config_file,
 8213        )
 8214        for op in operations:
 8215            op_name = operations[op].get("name", op).upper()
 8216            op_description = operations[op].get("description", op_name)
 8217            op_available = operations[op].get("available", False)
 8218            if op_available:
 8219                operations_help.append(f"   {op_name}: {op_description}")
 8220
 8221        # Sort operations
 8222        operations_help.sort()
 8223
 8224        # insert header
 8225        operations_help.insert(0, "Available calculation operations:")
 8226
 8227        # Return
 8228        return operations_help
 8229
 8230    def calculation(
 8231        self,
 8232        operations: dict = {},
 8233        operations_config_dict: dict = {},
 8234        operations_config_file: str = None,
 8235    ) -> None:
 8236        """
 8237        It takes a list of operations, and for each operation, it checks if it's a python or sql
 8238        operation, and then calls the appropriate function
 8239
 8240        param json example:
 8241            "calculation": {
 8242                "NOMEN": {
 8243                    "options": {
 8244                        "hgvs_field": "hgvs"
 8245                    },
 8246                "middle" : null
 8247            }
 8248        """
 8249
 8250        # Param
 8251        param = self.get_param()
 8252
 8253        # CHeck operations config file
 8254        if operations_config_file is None:
 8255            operations_config_file = param.get("calculation", {}).get(
 8256                "calculation_config", None
 8257            )
 8258
 8259        # operations config
 8260        operations_config = self.get_config_json(
 8261            name="calculations",
 8262            config_dict=operations_config_dict,
 8263            config_file=operations_config_file,
 8264        )
 8265
 8266        # Upper keys
 8267        operations_config = {k.upper(): v for k, v in operations_config.items()}
 8268
 8269        # Calculations
 8270
 8271        # Operations from param
 8272        operations = param.get("calculation", {}).get("calculations", operations)
 8273
 8274        # Quick calculation - add
 8275        if param.get("calculations", None):
 8276
 8277            # List of operations
 8278            calculations_list = [
 8279                value.strip() for value in param.get("calculations", "").split(",")
 8280            ]
 8281
 8282            # Log
 8283            log.info(f"Quick Calculations:")
 8284            for calculation_key in calculations_list:
 8285                log.info(f"   {calculation_key}")
 8286
 8287            # Create tmp operations (to keep operation order)
 8288            operations_tmp = {}
 8289            for calculation_operation in calculations_list:
 8290                if calculation_operation.upper() not in operations_tmp:
 8291                    log.debug(
 8292                        f"{calculation_operation}.upper() not in {operations_tmp}"
 8293                    )
 8294                    operations_tmp[calculation_operation.upper()] = {}
 8295                    add_value_into_dict(
 8296                        dict_tree=operations_tmp,
 8297                        sections=[
 8298                            calculation_operation.upper(),
 8299                        ],
 8300                        value=operations.get(calculation_operation.upper(), {}),
 8301                    )
 8302            # Add operations already in param
 8303            for calculation_operation in operations:
 8304                if calculation_operation not in operations_tmp:
 8305                    operations_tmp[calculation_operation] = operations.get(
 8306                        calculation_operation, {}
 8307                    )
 8308
 8309            # Update operations in param
 8310            operations = operations_tmp
 8311
 8312        # Operations for calculation
 8313        if not operations:
 8314            operations = param.get("calculation", {}).get("calculations", {})
 8315
 8316        if operations:
 8317            log.info(f"Calculations...")
 8318
 8319        # For each operations
 8320        for operation_name in operations:
 8321            operation_name = operation_name.upper()
 8322            if operation_name not in [""]:
 8323                if operation_name in operations_config:
 8324                    log.info(f"Calculation '{operation_name}'")
 8325                    operation = operations_config[operation_name]
 8326                    operation_type = operation.get("type", "sql")
 8327                    if operation_type == "python":
 8328                        self.calculation_process_function(
 8329                            operation=operation, operation_name=operation_name
 8330                        )
 8331                    elif operation_type == "sql":
 8332                        self.calculation_process_sql(
 8333                            operation=operation, operation_name=operation_name
 8334                        )
 8335                    else:
 8336                        log.error(
 8337                            f"Operations config: Type '{operation_type}' NOT available"
 8338                        )
 8339                        raise ValueError(
 8340                            f"Operations config: Type '{operation_type}' NOT available"
 8341                        )
 8342                else:
 8343                    log.error(
 8344                        f"Operations config: Calculation '{operation_name}' NOT available"
 8345                    )
 8346                    raise ValueError(
 8347                        f"Operations config: Calculation '{operation_name}' NOT available"
 8348                    )
 8349
 8350        # Explode INFOS fields into table fields
 8351        if self.get_explode_infos():
 8352            self.explode_infos(
 8353                prefix=self.get_explode_infos_prefix(),
 8354                fields=self.get_explode_infos_fields(),
 8355                force=True,
 8356            )
 8357
 8358    def calculation_process_sql(
 8359        self, operation: dict, operation_name: str = "unknown"
 8360    ) -> None:
 8361        """
 8362        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8363        performs the operation, updating the specified table with the result.
 8364
 8365        :param operation: The `operation` parameter is a dictionary that contains information about the
 8366        mathematical operation to be performed. It includes the following keys:
 8367        :type operation: dict
 8368        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8369        the mathematical operation being performed. It is used for logging and error handling purposes,
 8370        defaults to unknown
 8371        :type operation_name: str (optional)
 8372        """
 8373
 8374        # Operation infos
 8375        operation_name = operation.get("name", "unknown")
 8376        log.debug(f"process SQL {operation_name}")
 8377        output_column_name = operation.get("output_column_name", operation_name)
 8378        output_column_type = operation.get("output_column_type", "String")
 8379        prefix = operation.get("explode_infos_prefix", "")
 8380        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8381        output_column_description = operation.get(
 8382            "output_column_description", f"{operation_name} operation"
 8383        )
 8384        operation_query = operation.get("operation_query", None)
 8385        if isinstance(operation_query, list):
 8386            operation_query = " ".join(operation_query)
 8387        operation_info_fields = operation.get("info_fields", [])
 8388        operation_info_fields_check = operation.get("info_fields_check", False)
 8389        operation_info = operation.get("operation_info", True)
 8390        operation_table = operation.get(
 8391            "table", self.get_table_variants(clause="alter")
 8392        )
 8393
 8394        # table variants
 8395        if operation_table:
 8396            table_variants = operation_table
 8397        else:
 8398            table_variants = self.get_table_variants(clause="alter")
 8399
 8400        if operation_query:
 8401
 8402            # Info fields check
 8403            operation_info_fields_check_result = True
 8404            if operation_info_fields_check:
 8405                header_infos = self.get_header().infos
 8406                for info_field in operation_info_fields:
 8407                    operation_info_fields_check_result = (
 8408                        operation_info_fields_check_result
 8409                        and info_field in header_infos
 8410                    )
 8411
 8412            # If info fields available
 8413            if operation_info_fields_check_result:
 8414
 8415                # Added_columns
 8416                added_columns = []
 8417
 8418                # Create VCF header field
 8419                vcf_reader = self.get_header()
 8420                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8421                    output_column_name,
 8422                    ".",
 8423                    output_column_type,
 8424                    output_column_description,
 8425                    "howard calculation",
 8426                    "0",
 8427                    self.code_type_map.get(output_column_type),
 8428                )
 8429
 8430                # Explode infos if needed
 8431                log.debug(f"calculation_process_sql prefix {prefix}")
 8432                added_columns += self.explode_infos(
 8433                    prefix=prefix,
 8434                    fields=[output_column_name] + operation_info_fields,
 8435                    force=False,
 8436                    table=table_variants,
 8437                )
 8438
 8439                # Create column
 8440                added_column = self.add_column(
 8441                    table_name=table_variants,
 8442                    column_name=prefix + output_column_name,
 8443                    column_type=output_column_type_sql,
 8444                    default_value="null",
 8445                )
 8446                added_columns.append(added_column)
 8447
 8448                # Operation calculation
 8449                try:
 8450
 8451                    # Query to update calculation column
 8452                    sql_update = f"""
 8453                        UPDATE {table_variants}
 8454                        SET "{prefix}{output_column_name}" = ({operation_query})
 8455                    """
 8456                    self.conn.execute(sql_update)
 8457
 8458                    # Add to INFO
 8459                    if operation_info:
 8460                        sql_update_info = f"""
 8461                            UPDATE {table_variants}
 8462                            SET "INFO" =
 8463                                concat(
 8464                                    CASE
 8465                                        WHEN "INFO" IS NOT NULL
 8466                                        THEN concat("INFO", ';')
 8467                                        ELSE ''
 8468                                    END,
 8469                                    '{output_column_name}=',
 8470                                    "{prefix}{output_column_name}"
 8471                                )
 8472                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8473                        """
 8474                        self.conn.execute(sql_update_info)
 8475
 8476                except:
 8477                    log.error(
 8478                        f"Operations config: Calculation '{operation_name}' query failed"
 8479                    )
 8480                    raise ValueError(
 8481                        f"Operations config: Calculation '{operation_name}' query failed"
 8482                    )
 8483
 8484                # Remove added columns
 8485                for added_column in added_columns:
 8486                    log.debug(f"added_column: {added_column}")
 8487                    self.drop_column(column=added_column)
 8488
 8489            else:
 8490                log.error(
 8491                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8492                )
 8493                raise ValueError(
 8494                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8495                )
 8496
 8497        else:
 8498            log.error(
 8499                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8500            )
 8501            raise ValueError(
 8502                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8503            )
 8504
 8505    def calculation_process_function(
 8506        self, operation: dict, operation_name: str = "unknown"
 8507    ) -> None:
 8508        """
 8509        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8510        function with the given parameters.
 8511
 8512        :param operation: The `operation` parameter is a dictionary that contains information about the
 8513        operation to be performed. It has the following keys:
 8514        :type operation: dict
 8515        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8516        the operation being performed. It is used for logging purposes, defaults to unknown
 8517        :type operation_name: str (optional)
 8518        """
 8519
 8520        operation_name = operation["name"]
 8521        log.debug(f"process Python {operation_name}")
 8522        function_name = operation["function_name"]
 8523        function_params = operation["function_params"]
 8524        getattr(self, function_name)(*function_params)
 8525
 8526    def calculation_variant_id(self) -> None:
 8527        """
 8528        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8529        updates the INFO field of a variants table with the variant ID.
 8530        """
 8531
 8532        # variant_id annotation field
 8533        variant_id_tag = self.get_variant_id_column()
 8534        added_columns = [variant_id_tag]
 8535
 8536        # variant_id hgvs tags"
 8537        vcf_infos_tags = {
 8538            variant_id_tag: "howard variant ID annotation",
 8539        }
 8540
 8541        # Variants table
 8542        table_variants = self.get_table_variants()
 8543
 8544        # Header
 8545        vcf_reader = self.get_header()
 8546
 8547        # Add variant_id to header
 8548        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8549            variant_id_tag,
 8550            ".",
 8551            "String",
 8552            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8553            "howard calculation",
 8554            "0",
 8555            self.code_type_map.get("String"),
 8556        )
 8557
 8558        # Update
 8559        sql_update = f"""
 8560            UPDATE {table_variants}
 8561            SET "INFO" = 
 8562                concat(
 8563                    CASE
 8564                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8565                        THEN ''
 8566                        ELSE concat("INFO", ';')
 8567                    END,
 8568                    '{variant_id_tag}=',
 8569                    "{variant_id_tag}"
 8570                )
 8571        """
 8572        self.conn.execute(sql_update)
 8573
 8574        # Remove added columns
 8575        for added_column in added_columns:
 8576            self.drop_column(column=added_column)
 8577
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the snpEff annotation INFO field and
        append them to the INFO column under a new tag.

        Parses the snpEff sub-field layout from the quoted part of the VCF
        header description, applies `extract_snpeff_hgvs` to every variant's
        annotation value, registers the new tag in the header, and appends
        '<snpeff_hgvs>=<value>' to INFO via an UPDATE joined on the variant
        ID column. If the snpEff field is absent from the header, only a
        warning is emitted.

        :param snpeff_hgvs: name of the INFO tag receiving the extracted HGVS
            nomenclatures, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: name of the INFO field holding snpEff
            annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: when the snpEff header description cannot be
            parsed
        """

        # Header description for the new tag
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty prefix is forced to "INFO/" here —
        # confirm this override (rather than keeping the configured prefix)
        # is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the snpEff annotation and the extracted HGVS value
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the snpEff INFO field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract the ANN sub-field names from the single-quoted part of
            # the header description (e.g. 'Allele | Annotation | ...')
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key mapped to the original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant IDs and snpEff annotations into a dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Extract the HGVS nomenclatures from each annotation value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Register the new tag in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO for variants whose
            # extraction is non-empty, joining on the variant ID column
            # NOTE(review): the table name 'variants' is hard-coded in the
            # UPDATE clause while the WHERE clause uses {table_variants} —
            # confirm both always refer to the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe before dropping helper columns
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8714
 8715    def calculation_snpeff_ann_explode(
 8716        self,
 8717        uniquify: bool = True,
 8718        output_format: str = "fields",
 8719        output_prefix: str = "snpeff_",
 8720        snpeff_field: str = "ANN",
 8721    ) -> None:
 8722        """
 8723        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8724        exploding the HGVS field and updating variant information accordingly.
 8725
 8726        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8727        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8728        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8729        defaults to True
 8730        :type uniquify: bool (optional)
 8731        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8732        function specifies the format in which the output annotations will be generated. It has a
 8733        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8734        format, defaults to fields
 8735        :type output_format: str (optional)
 8736        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8737        method is used to specify the prefix that will be added to the output annotations generated
 8738        during the calculation process. This prefix helps to differentiate the newly added annotations
 8739        from existing ones in the output data. By default, the, defaults to ANN_
 8740        :type output_prefix: str (optional)
 8741        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8742        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8743        field will be processed to explode the HGVS annotations and update the variant information
 8744        accordingly, defaults to ANN
 8745        :type snpeff_field: str (optional)
 8746        """
 8747
 8748        # SnpEff annotation field
 8749        snpeff_hgvs = "snpeff_ann_explode"
 8750
 8751        # Snpeff hgvs tags
 8752        vcf_infos_tags = {
 8753            snpeff_hgvs: "Explode snpEff annotations",
 8754        }
 8755
 8756        # Prefix
 8757        prefix = self.get_explode_infos_prefix()
 8758        if prefix:
 8759            prefix = "INFO/"
 8760
 8761        # snpEff fields
 8762        speff_ann_infos = prefix + snpeff_field
 8763        speff_hgvs_infos = prefix + snpeff_hgvs
 8764
 8765        # Variants table
 8766        table_variants = self.get_table_variants()
 8767
 8768        # Header
 8769        vcf_reader = self.get_header()
 8770
 8771        # Add columns
 8772        added_columns = []
 8773
 8774        # Explode HGVS field in column
 8775        added_columns += self.explode_infos(fields=[snpeff_field])
 8776        log.debug(f"snpeff_field={snpeff_field}")
 8777        log.debug(f"added_columns={added_columns}")
 8778
 8779        if snpeff_field in vcf_reader.infos:
 8780
 8781            # Extract ANN header
 8782            ann_description = vcf_reader.infos[snpeff_field].desc
 8783            pattern = r"'(.+?)'"
 8784            match = re.search(pattern, ann_description)
 8785            if match:
 8786                ann_header_match = match.group(1).split(" | ")
 8787                ann_header = []
 8788                ann_header_desc = {}
 8789                for i in range(len(ann_header_match)):
 8790                    ann_header_info = "".join(
 8791                        char for char in ann_header_match[i] if char.isalnum()
 8792                    )
 8793                    ann_header.append(ann_header_info)
 8794                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8795                if not ann_header_desc:
 8796                    raise ValueError("Invalid header description format")
 8797            else:
 8798                raise ValueError("Invalid header description format")
 8799
 8800            # Create variant id
 8801            variant_id_column = self.get_variant_id_column()
 8802            added_columns += [variant_id_column]
 8803
 8804            # Create dataframe
 8805            dataframe_snpeff_hgvs = self.get_query_to_df(
 8806                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8807            )
 8808
 8809            # Create snpEff columns
 8810            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8811                speff_ann_infos
 8812            ].apply(
 8813                lambda x: explode_snpeff_ann(
 8814                    str(x),
 8815                    uniquify=uniquify,
 8816                    output_format=output_format,
 8817                    prefix=output_prefix,
 8818                    header=list(ann_header_desc.values()),
 8819                )
 8820            )
 8821
 8822            # Header
 8823            ann_annotations_prefix = ""
 8824            if output_format.upper() in ["JSON"]:
 8825                ann_annotations_prefix = f"{output_prefix}="
 8826                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8827                    output_prefix,
 8828                    ".",
 8829                    "String",
 8830                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8831                    + " - JSON format",
 8832                    "howard calculation",
 8833                    "0",
 8834                    self.code_type_map.get("String"),
 8835                )
 8836            else:
 8837                for ann_annotation in ann_header:
 8838                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8839                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8840                        ann_annotation_id,
 8841                        ".",
 8842                        "String",
 8843                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8844                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8845                        "howard calculation",
 8846                        "0",
 8847                        self.code_type_map.get("String"),
 8848                    )
 8849
 8850            # Update
 8851            sql_update = f"""
 8852                UPDATE variants
 8853                SET "INFO" = 
 8854                    concat(
 8855                        CASE
 8856                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8857                            THEN ''
 8858                            ELSE concat("INFO", ';')
 8859                        END,
 8860                        CASE 
 8861                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8862                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8863                            THEN concat(
 8864                                '{ann_annotations_prefix}',
 8865                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8866                                )
 8867                            ELSE ''
 8868                        END
 8869                    )
 8870                FROM dataframe_snpeff_hgvs
 8871                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8872
 8873            """
 8874            self.conn.execute(sql_update)
 8875
 8876            # Delete dataframe
 8877            del dataframe_snpeff_hgvs
 8878            gc.collect()
 8879
 8880        else:
 8881
 8882            log.warning(
 8883                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8884            )
 8885
 8886        # Remove added columns
 8887        for added_column in added_columns:
 8888            self.drop_column(column=added_column)
 8889
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Options are read from param["calculation"]["calculations"]["NOMEN"]["options"]:

        - "hgvs_field": INFO field containing the HGVS annotations (default "hgvs")
        - "pattern": NOMEN construction pattern forwarded to `find_nomen`
        - "transcripts": path to a transcripts-of-preference file (first column used)
        - "transcripts_table"/"transcripts_column": table/column holding a
          per-variant preferred transcript
        - "transcripts_order": preference order between "column" and "file"
          transcript sources (default ["column", "file"])

        For each variant, `find_nomen` returns a dict of NOMEN sub-fields
        (NOMEN, CNOMEN, PNOMEN, ...); each non-empty sub-field is appended to
        the INFO column as ';<FIELD>=<value>' and declared in the VCF header.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the per-variant dict
        # of NOMEN sub-fields returned by find_nomen()
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> VCF header description.
        # Iteration order also fixes the order of the generated SQL CASE clauses.
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads (NOTE(review): assigned but not used in this method)
        threads = self.get_threads()

        # Prefix for exploded INFO columns (e.g. "INFO/")
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added by explode_infos(); dropped again at the end
        added_columns = []

        # Get HGVS field name from options (default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern (None lets find_nomen use its default)
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources (keyed by source name, e.g. "file")
        transcripts_sources = {}

        # Get transcripts-of-preference file path
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the file is the ordered transcript list
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table (defaults to the variants table itself)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column (no default: column source disabled if unset)
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # SQL expression selecting the per-variant preferred transcript
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            # No column source: SELECT NULL AS transcript below
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file (empty list when no file configured)
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column (must run before get_extra_infos below)
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: check the exploded HGVS column actually exists
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe with variant key, HGVS and preferred transcript
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Transcripts rank: transcript -> 1-based preference rank from file
            transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)}
            transcripts_len = len(transcripts_rank)

            # Create main NOMEN column: one dict of sub-fields per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # One CASE clause per sub-field; accesses the struct member of
                # the registered dataframe (dataframe."NOMEN_DICT"."<field>")
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update: all sub-field clauses concatenated onto INFO
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO, joining the in-memory dataframe on the variant key
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 9093
 9094    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9095        """
 9096        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9097        pipeline/sample for a variant and updates the variant information in a VCF file.
 9098
 9099        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9100        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9101        VCF header and to update the corresponding field in the variants table, defaults to
 9102        findbypipeline
 9103        :type tag: str (optional)
 9104        """
 9105
 9106        # if FORMAT and samples
 9107        if (
 9108            "FORMAT" in self.get_header_columns_as_list()
 9109            and self.get_header_sample_list()
 9110        ):
 9111
 9112            # findbypipeline annotation field
 9113            findbypipeline_tag = tag
 9114
 9115            # VCF infos tags
 9116            vcf_infos_tags = {
 9117                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9118            }
 9119
 9120            # Prefix
 9121            prefix = self.get_explode_infos_prefix()
 9122
 9123            # Field
 9124            findbypipeline_infos = prefix + findbypipeline_tag
 9125
 9126            # Variants table
 9127            table_variants = self.get_table_variants()
 9128
 9129            # Header
 9130            vcf_reader = self.get_header()
 9131
 9132            # Create variant id
 9133            variant_id_column = self.get_variant_id_column()
 9134            added_columns = [variant_id_column]
 9135
 9136            # variant_id, FORMAT and samples
 9137            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9138                self.get_header_sample_list()
 9139            )
 9140
 9141            # Create dataframe
 9142            dataframe_findbypipeline = self.get_query_to_df(
 9143                f""" SELECT {samples_fields} FROM {table_variants} """
 9144            )
 9145
 9146            # Create findbypipeline column
 9147            dataframe_findbypipeline[findbypipeline_infos] = (
 9148                dataframe_findbypipeline.apply(
 9149                    lambda row: findbypipeline(
 9150                        row, samples=self.get_header_sample_list()
 9151                    ),
 9152                    axis=1,
 9153                )
 9154            )
 9155
 9156            # Add snpeff_hgvs to header
 9157            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9158                findbypipeline_tag,
 9159                ".",
 9160                "String",
 9161                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9162                "howard calculation",
 9163                "0",
 9164                self.code_type_map.get("String"),
 9165            )
 9166
 9167            # Update
 9168            sql_update = f"""
 9169                UPDATE variants
 9170                SET "INFO" = 
 9171                    concat(
 9172                        CASE
 9173                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9174                            THEN ''
 9175                            ELSE concat("INFO", ';')
 9176                        END,
 9177                        CASE 
 9178                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9179                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9180                            THEN concat(
 9181                                    '{findbypipeline_tag}=',
 9182                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9183                                )
 9184                            ELSE ''
 9185                        END
 9186                    )
 9187                FROM dataframe_findbypipeline
 9188                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9189            """
 9190            self.conn.execute(sql_update)
 9191
 9192            # Remove added columns
 9193            for added_column in added_columns:
 9194                self.drop_column(column=added_column)
 9195
 9196            # Delete dataframe
 9197            del dataframe_findbypipeline
 9198            gc.collect()
 9199
 9200    def calculation_genotype_concordance(self) -> None:
 9201        """
 9202        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9203        multi-caller VCF files and updates the variant information in the database.
 9204        """
 9205
 9206        # if FORMAT and samples
 9207        if (
 9208            "FORMAT" in self.get_header_columns_as_list()
 9209            and self.get_header_sample_list()
 9210        ):
 9211
 9212            # genotypeconcordance annotation field
 9213            genotypeconcordance_tag = "genotypeconcordance"
 9214
 9215            # VCF infos tags
 9216            vcf_infos_tags = {
 9217                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9218            }
 9219
 9220            # Prefix
 9221            prefix = self.get_explode_infos_prefix()
 9222
 9223            # Field
 9224            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9225
 9226            # Variants table
 9227            table_variants = self.get_table_variants()
 9228
 9229            # Header
 9230            vcf_reader = self.get_header()
 9231
 9232            # Create variant id
 9233            variant_id_column = self.get_variant_id_column()
 9234            added_columns = [variant_id_column]
 9235
 9236            # variant_id, FORMAT and samples
 9237            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9238                self.get_header_sample_list()
 9239            )
 9240
 9241            # Create dataframe
 9242            dataframe_genotypeconcordance = self.get_query_to_df(
 9243                f""" SELECT {samples_fields} FROM {table_variants} """
 9244            )
 9245
 9246            # Create genotypeconcordance column
 9247            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9248                dataframe_genotypeconcordance.apply(
 9249                    lambda row: genotypeconcordance(
 9250                        row, samples=self.get_header_sample_list()
 9251                    ),
 9252                    axis=1,
 9253                )
 9254            )
 9255
 9256            # Add genotypeconcordance to header
 9257            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9258                genotypeconcordance_tag,
 9259                ".",
 9260                "String",
 9261                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9262                "howard calculation",
 9263                "0",
 9264                self.code_type_map.get("String"),
 9265            )
 9266
 9267            # Update
 9268            sql_update = f"""
 9269                UPDATE variants
 9270                SET "INFO" = 
 9271                    concat(
 9272                        CASE
 9273                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9274                            THEN ''
 9275                            ELSE concat("INFO", ';')
 9276                        END,
 9277                        CASE
 9278                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9279                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9280                            THEN concat(
 9281                                    '{genotypeconcordance_tag}=',
 9282                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9283                                )
 9284                            ELSE ''
 9285                        END
 9286                    )
 9287                FROM dataframe_genotypeconcordance
 9288                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9289            """
 9290            self.conn.execute(sql_update)
 9291
 9292            # Remove added columns
 9293            for added_column in added_columns:
 9294                self.drop_column(column=added_column)
 9295
 9296            # Delete dataframe
 9297            del dataframe_genotypeconcordance
 9298            gc.collect()
 9299
 9300    def calculation_barcode(self, tag: str = "barcode") -> None:
 9301        """
 9302        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9303        updates the INFO field in the file with the calculated barcode values.
 9304
 9305        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9306        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9307        the default tag name is set to "barcode", defaults to barcode
 9308        :type tag: str (optional)
 9309        """
 9310
 9311        # if FORMAT and samples
 9312        if (
 9313            "FORMAT" in self.get_header_columns_as_list()
 9314            and self.get_header_sample_list()
 9315        ):
 9316
 9317            # barcode annotation field
 9318            if not tag:
 9319                tag = "barcode"
 9320
 9321            # VCF infos tags
 9322            vcf_infos_tags = {
 9323                tag: "barcode calculation (VaRank)",
 9324            }
 9325
 9326            # Prefix
 9327            prefix = self.get_explode_infos_prefix()
 9328
 9329            # Field
 9330            barcode_infos = prefix + tag
 9331
 9332            # Variants table
 9333            table_variants = self.get_table_variants()
 9334
 9335            # Header
 9336            vcf_reader = self.get_header()
 9337
 9338            # Create variant id
 9339            variant_id_column = self.get_variant_id_column()
 9340            added_columns = [variant_id_column]
 9341
 9342            # variant_id, FORMAT and samples
 9343            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9344                self.get_header_sample_list()
 9345            )
 9346
 9347            # Create dataframe
 9348            dataframe_barcode = self.get_query_to_df(
 9349                f""" SELECT {samples_fields} FROM {table_variants} """
 9350            )
 9351
 9352            # Create barcode column
 9353            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9354                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9355            )
 9356
 9357            # Add barcode to header
 9358            vcf_reader.infos[tag] = vcf.parser._Info(
 9359                tag,
 9360                ".",
 9361                "String",
 9362                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9363                "howard calculation",
 9364                "0",
 9365                self.code_type_map.get("String"),
 9366            )
 9367
 9368            # Update
 9369            sql_update = f"""
 9370                UPDATE {table_variants}
 9371                SET "INFO" = 
 9372                    concat(
 9373                        CASE
 9374                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9375                            THEN ''
 9376                            ELSE concat("INFO", ';')
 9377                        END,
 9378                        CASE
 9379                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9380                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9381                            THEN concat(
 9382                                    '{tag}=',
 9383                                    dataframe_barcode."{barcode_infos}"
 9384                                )
 9385                            ELSE ''
 9386                        END
 9387                    )
 9388                FROM dataframe_barcode
 9389                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9390            """
 9391            self.conn.execute(sql_update)
 9392
 9393            # Remove added columns
 9394            for added_column in added_columns:
 9395                self.drop_column(column=added_column)
 9396
 9397            # Delete dataframe
 9398            del dataframe_barcode
 9399            gc.collect()
 9400
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family pedigree is read from
        param["calculation"]["calculations"]["BARCODEFAMILY"]["family_pedigree"],
        which may be a YAML/JSON file path, a JSON string, a comma-separated list
        of sample names, or a dict; when absent, all header samples are used.
        Unlike `calculation_barcode`, the result is written as two per-sample
        FORMAT fields ('<tag>' = family barcode, '<tag>S' = family sample list)
        appended to every sample column, not as an INFO field.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Genotype data is required: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default when empty)
            if not tag:
                tag = "BCF"

            # VCF infos tags: descriptions for the two FORMAT fields
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree definition (file path, JSON str, sample list str, or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file: parse as YAML
                # NOTE(review): `ped` is rebound twice here (path -> file handle
                # -> parsed dict); the file is still closed by the with-block
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as a
                # comma-separated sample list mapped onto itself
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON (exception object unused): fall back to CSV list
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of member sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: every header sample is a family member
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column name holding the computed family barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe with the family genotype columns
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields to header
            # ('<tag>' = family barcode, '<tag>S' = family sample list)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append ':<value>:<value_samples>' to FORMAT and to every
            # sample column. Family members get the barcode and the sample list,
            # FORMAT gets the field names, other samples get '.' placeholders.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.'-only samples, synthesize missing sub-values first:
                # stripping alphanumerics/whitespace from FORMAT leaves one ':'
                # per extra field, and ':' -> ':.' turns them into '.' entries
                # (e.g. FORMAT 'GT:DP:AD' gives './.' + ':.:.' = './.:.:.')
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()
 9590
    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.

        The trio pedigree (father, mother, child) is read from
        param["calculation"]["calculations"]["TRIO"]["trio_pedigree"], which may be:
        a path to a YAML/JSON pedigree file, a JSON string, a comma-separated
        "father,mother,child" string, or a dict. If not provided, the first three
        samples of the VCF header are used.

        :raises ValueError: if the pedigree is malformed or fewer than 3 samples are available
        """

        # Only applicable when the VCF has genotypes (a FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # INFO tag used to store the trio annotation
            trio_tag = "trio"

            # Descriptions for VCF header INFO tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio pedigree parameter (file path, JSON string, CSV string, or dict)
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load the pedigree into a dict with keys father/mother/child
            if trio_ped:

                # Trio pedigree is a file (parsed with yaml.safe_load, so YAML or JSON)
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = yaml.safe_load(trio_ped)

                # Trio pedigree is a string: try JSON first, then "father,mother,child"
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is already a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Samples ordered as [father, mother, child]
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            # No pedigree defined: default to the first 3 samples of the header
            else:
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree has exactly the 3 expected members
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Name of the temporary dataframe column holding the trio value
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Ensure a variant id column exists (added here, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the trio annotation row by row (trio() helper)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Declare the trio INFO tag in the VCF header
            # NOTE(review): the fallback description "snpEff hgvs annotations" looks
            # copy-pasted from another method; it is never used since trio_tag is a
            # key of vcf_infos_tags
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append 'trio=<value>' to INFO for each variant, joining on variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                             AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                    '{trio_tag}=',
                                    dataframe_trio."{trio_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory
            del dataframe_trio
            gc.collect()
 9769
 9770    def calculation_vaf_normalization(self) -> None:
 9771        """
 9772        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9773        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9774        :return: The function does not return anything.
 9775        """
 9776
 9777        # if FORMAT and samples
 9778        if (
 9779            "FORMAT" in self.get_header_columns_as_list()
 9780            and self.get_header_sample_list()
 9781        ):
 9782
 9783            # vaf_normalization annotation field
 9784            vaf_normalization_tag = "VAF"
 9785
 9786            # VCF infos tags
 9787            vcf_infos_tags = {
 9788                "VAF": "VAF Variant Frequency",
 9789            }
 9790
 9791            # Prefix
 9792            prefix = self.get_explode_infos_prefix()
 9793
 9794            # Variants table
 9795            table_variants = self.get_table_variants()
 9796
 9797            # Header
 9798            vcf_reader = self.get_header()
 9799
 9800            # Do not calculate if VAF already exists
 9801            if "VAF" in vcf_reader.formats:
 9802                log.debug("VAF already on genotypes")
 9803                return
 9804
 9805            # Create variant id
 9806            variant_id_column = self.get_variant_id_column()
 9807            added_columns = [variant_id_column]
 9808
 9809            # variant_id, FORMAT and samples
 9810            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9811                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9812            )
 9813
 9814            # Create dataframe
 9815            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9816            log.debug(f"query={query}")
 9817            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9818
 9819            vaf_normalization_set = []
 9820
 9821            # for each sample vaf_normalization
 9822            for sample in self.get_header_sample_list():
 9823                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9824                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9825                )
 9826                vaf_normalization_set.append(
 9827                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9828                )
 9829
 9830            # Add VAF to FORMAT
 9831            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9832                "FORMAT"
 9833            ].apply(lambda x: str(x) + ":VAF")
 9834            vaf_normalization_set.append(
 9835                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9836            )
 9837
 9838            # Add vaf_normalization to header
 9839            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9840                id=vaf_normalization_tag,
 9841                num="1",
 9842                type="Float",
 9843                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9844                type_code=self.code_type_map.get("Float"),
 9845            )
 9846
 9847            # Create fields to add in INFO
 9848            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9849
 9850            # Update
 9851            sql_update = f"""
 9852                UPDATE {table_variants}
 9853                SET {sql_vaf_normalization_set}
 9854                FROM dataframe_vaf_normalization
 9855                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9856
 9857            """
 9858            self.conn.execute(sql_update)
 9859
 9860            # Remove added columns
 9861            for added_column in added_columns:
 9862                self.drop_column(column=added_column)
 9863
 9864            # Delete dataframe
 9865            del dataframe_vaf_normalization
 9866            gc.collect()
 9867
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        For the given FORMAT field (e.g. VAF), per-variant statistics across all
        samples (number, list, min, max, mean, median, standard deviation) are
        computed with the genotype_stats() helper, one INFO tag per statistic is
        declared in the VCF header, and '<info>_stats_*=<value>' entries are
        appended to the INFO column.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when the VCF has genotypes (a FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # Descriptions for the per-statistic VCF INFO tags
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the temporary dataframe column holding the stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Ensure a variant id column exists (added here, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute per-variant statistics (a dict per row)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare this statistic as an INFO tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separate INFO entries with ';' from the second statistic onwards
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the statistics entries to INFO, joining on variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
10005
10006    def calculation_transcripts_annotation(
10007        self, info_json: str = None, info_format: str = None
10008    ) -> None:
10009        """
10010        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10011        field to it if transcripts are available.
10012
10013        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10014        is a string parameter that represents the information field to be used in the transcripts JSON.
10015        It is used to specify the JSON format for the transcripts information. If no value is provided
10016        when calling the method, it defaults to "
10017        :type info_json: str
10018        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10019        method is a string parameter that specifies the format of the information field to be used in
10020        the transcripts JSON. It is used to define the format of the information field
10021        :type info_format: str
10022        """
10023
10024        # Create transcripts table
10025        transcripts_table = self.create_transcript_view()
10026
10027        # Add info field
10028        if transcripts_table:
10029            self.transcript_view_to_variants(
10030                transcripts_table=transcripts_table,
10031                transcripts_info_field_json=info_json,
10032                transcripts_info_field_format=info_format,
10033            )
10034        else:
10035            log.info("No Transcripts to process. Check param.json file configuration")
10036
10037    def calculation_transcripts_prioritization(self) -> None:
10038        """
10039        The function `calculation_transcripts_prioritization` creates a transcripts table and
10040        prioritizes transcripts based on certain criteria.
10041        """
10042
10043        # Create transcripts table
10044        transcripts_table = self.create_transcript_view()
10045
10046        # Add info field
10047        if transcripts_table:
10048            self.transcripts_prioritization(transcripts_table=transcripts_table)
10049        else:
10050            log.info("No Transcripts to process. Check param.json file configuration")
10051
10052    def calculation_transcripts_export(self) -> None:
10053        """ """
10054
10055        # Create transcripts table
10056        transcripts_table = self.create_transcript_view()
10057
10058        # Add info field
10059        if transcripts_table:
10060            self.transcripts_export(transcripts_table=transcripts_table)
10061        else:
10062            log.info("No Transcripts to process. Check param.json file configuration")
10063
10064    ###############
10065    # Transcripts #
10066    ###############
10067
10068    def transcripts_export(
10069        self, transcripts_table: str = None, param: dict = {}
10070    ) -> bool:
10071        """ """
10072
10073        log.debug("Start transcripts export...")
10074
10075        # Param
10076        if not param:
10077            param = self.get_param()
10078
10079        # Param export
10080        param_transcript_export = param.get("transcripts", {}).get("export", {})
10081
10082        # Output file
10083        transcripts_export_output = param_transcript_export.get("output", None)
10084
10085        if not param_transcript_export or not transcripts_export_output:
10086            log.warning(f"No transcriipts export parameters defined!")
10087            return False
10088
10089        # List of transcripts annotations
10090        query_describe = f"""
10091            SELECT column_name
10092            FROM (
10093                    DESCRIBE SELECT * FROM {transcripts_table}
10094                )
10095            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10096        """
10097        transcripts_annotations_list = list(
10098            self.get_query_to_df(query=query_describe)["column_name"]
10099        )
10100
10101        # Create transcripts table for export
10102        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10103            random.choices(string.ascii_uppercase + string.digits, k=10)
10104        )
10105        query_create_transcripts_table_export = f"""
10106            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10107        """
10108        self.execute_query(query=query_create_transcripts_table_export)
10109
10110        # Output file format
10111        transcripts_export_output_format = get_file_format(
10112            filename=transcripts_export_output
10113        )
10114
10115        # Format VCF - construct INFO
10116        if transcripts_export_output_format in ["vcf"]:
10117
10118            # Construct query update INFO and header
10119            query_update_info = []
10120            for field in transcripts_annotations_list:
10121
10122                # If field not in header
10123                if field not in self.get_header_infos_list():
10124
10125                    # Add PZ Transcript in header
10126                    self.get_header().infos[field] = vcf.parser._Info(
10127                        field,
10128                        ".",
10129                        "String",
10130                        f"Annotation '{field}' from transcript view",
10131                        "unknown",
10132                        "unknown",
10133                        0,
10134                    )
10135
10136                # Add field as INFO/tag
10137                query_update_info.append(
10138                    f"""
10139                        CASE
10140                            WHEN "{field}" IS NOT NULL
10141                            THEN concat('{field}=', "{field}", ';')    
10142                            ELSE ''     
10143                        END
10144                        """
10145                )
10146
10147            # Query param
10148            query_update_info_value = (
10149                f""" concat('',  {", ".join(query_update_info)}) """
10150            )
10151            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10152
10153        else:
10154
10155            # Query param
10156            query_update_info_value = f""" NULL """
10157            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10158
10159        # Update query INFO column
10160        query_update = f"""
10161            UPDATE {transcripts_table_export}
10162            SET INFO = {query_update_info_value}
10163
10164        """
10165        self.execute_query(query=query_update)
10166
10167        # Export
10168        self.export_output(
10169            output_file=transcripts_export_output,
10170            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10171        )
10172
10173        # Drop transcripts export table
10174        query_drop_transcripts_table_export = f"""
10175            DROP TABLE {transcripts_table_export}
10176        """
10177        self.execute_query(query=query_drop_transcripts_table_export)
10178
10179    def transcripts_prioritization(
10180        self, transcripts_table: str = None, param: dict = {}
10181    ) -> bool:
10182        """
10183        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10184        and updates the variants table with the prioritized information.
10185
10186        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10187        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10188        This parameter is used to identify the table where the transcripts data is stored for the
10189        prioritization process
10190        :type transcripts_table: str
10191        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10192        that contains various configuration settings for the prioritization process of transcripts. It
10193        is used to customize the behavior of the prioritization algorithm and includes settings such as
10194        the prefix for prioritization fields, default profiles, and other
10195        :type param: dict
10196        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10197        transcripts prioritization process is successfully completed, and `False` if there are any
10198        issues or if no profile is defined for transcripts prioritization.
10199        """
10200
10201        log.debug("Start transcripts prioritization...")
10202
10203        # Param
10204        if not param:
10205            param = self.get_param()
10206
10207        # Variants table
10208        table_variants = self.get_table_variants()
10209
10210        # Transcripts table
10211        if transcripts_table is None:
10212            transcripts_table = self.create_transcript_view(
10213                transcripts_table="transcripts", param=param
10214            )
10215        if transcripts_table is None:
10216            msg_err = "No Transcripts table availalble"
10217            log.error(msg_err)
10218            raise ValueError(msg_err)
10219        log.debug(f"transcripts_table={transcripts_table}")
10220
10221        # Get transcripts columns
10222        columns_as_list_query = f"""
10223            DESCRIBE {transcripts_table}
10224        """
10225        columns_as_list = list(
10226            self.get_query_to_df(columns_as_list_query)["column_name"]
10227        )
10228
10229        # Create INFO if not exists
10230        if "INFO" not in columns_as_list:
10231            query_add_info = f"""
10232                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10233            """
10234            self.execute_query(query_add_info)
10235
10236        # Prioritization param and Force only PZ Score and Flag
10237        pz_param = param.get("transcripts", {}).get("prioritization", {})
10238
10239        # PZ profile by default
10240        pz_profile_default = (
10241            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10242        )
10243
10244        # Exit if no profile
10245        if pz_profile_default is None:
10246            log.warning("No profile defined for transcripts prioritization")
10247            return False
10248
10249        # PZ fields
10250        pz_param_pzfields = {}
10251
10252        # PZ field transcripts
10253        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10254
10255        # Add PZ Transcript in header
10256        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10257            pz_fields_transcripts,
10258            ".",
10259            "String",
10260            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10261            "unknown",
10262            "unknown",
10263            code_type_map["String"],
10264        )
10265
10266        # Mandatory fields
10267        pz_mandatory_fields_list = [
10268            "Score",
10269            "Flag",
10270            "Tags",
10271            "Comment",
10272            "Infos",
10273            "Class",
10274        ]
10275        pz_mandatory_fields = []
10276        for pz_mandatory_field in pz_mandatory_fields_list:
10277            pz_mandatory_fields.append(
10278                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10279            )
10280
10281        # PZ fields in param
10282        for pz_field in pz_param.get("pzfields", []):
10283            if pz_field in pz_mandatory_fields_list:
10284                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10285                    pz_param.get("pzprefix", "PTZ") + pz_field
10286                )
10287            else:
10288                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10289                pz_param_pzfields[pz_field] = pz_field_new
10290
10291                # Add PZ Transcript in header
10292                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10293                    pz_field_new,
10294                    ".",
10295                    "String",
10296                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10297                    "unknown",
10298                    "unknown",
10299                    code_type_map["String"],
10300                )
10301
10302        # PZ fields param
10303        pz_param["pzfields"] = pz_mandatory_fields
10304
10305        # Prioritization
10306        prioritization_result = self.prioritization(
10307            table=transcripts_table,
10308            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10309        )
10310        if not prioritization_result:
10311            log.warning("Transcripts prioritization not processed")
10312            return False
10313
10314        # PZ fields sql query
10315        query_update_select_list = []
10316        query_update_concat_list = []
10317        query_update_order_list = []
10318        for pz_param_pzfield in set(
10319            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10320        ):
10321            query_update_select_list.append(f" {pz_param_pzfield}, ")
10322
10323        for pz_param_pzfield in pz_param_pzfields:
10324            query_update_concat_list.append(
10325                f"""
10326                    , CASE 
10327                        WHEN {pz_param_pzfield} IS NOT NULL
10328                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10329                        ELSE ''
10330                    END
10331                """
10332            )
10333
10334        # Order by
10335        pz_orders = (
10336            param.get("transcripts", {})
10337            .get("prioritization", {})
10338            .get("prioritization_transcripts_order", {})
10339        )
10340        if not pz_orders:
10341            pz_orders = {
10342                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10343                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10344            }
10345        for pz_order in pz_orders:
10346            query_update_order_list.append(
10347                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10348            )
10349
10350        # Fields to explode
10351        fields_to_explode = (
10352            list(pz_param_pzfields.keys())
10353            + pz_mandatory_fields
10354            + list(pz_orders.keys())
10355        )
10356        # Remove transcript column as a specific transcript column
10357        if "transcript" in fields_to_explode:
10358            fields_to_explode.remove("transcript")
10359
10360        # Fields intranscripts table
10361        query_transcripts_table = f"""
10362            DESCRIBE SELECT * FROM {transcripts_table}
10363        """
10364        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10365
10366        # Check fields to explode
10367        for field_to_explode in fields_to_explode:
10368            if field_to_explode not in self.get_header_infos_list() + list(
10369                query_transcripts_table.column_name
10370            ):
10371                msg_err = f"INFO/{field_to_explode} NOT IN header"
10372                log.error(msg_err)
10373                raise ValueError(msg_err)
10374
10375        # Explode fields to explode
10376        self.explode_infos(
10377            table=transcripts_table,
10378            fields=fields_to_explode,
10379        )
10380
10381        # Transcript preference file
10382        transcripts_preference_file = (
10383            param.get("transcripts", {})
10384            .get("prioritization", {})
10385            .get("prioritization_transcripts", {})
10386        )
10387        transcripts_preference_file = full_path(transcripts_preference_file)
10388
10389        # Transcript preference forced
10390        transcript_preference_force = (
10391            param.get("transcripts", {})
10392            .get("prioritization", {})
10393            .get("prioritization_transcripts_force", False)
10394        )
10395        # Transcript version forced
10396        transcript_version_force = (
10397            param.get("transcripts", {})
10398            .get("prioritization", {})
10399            .get("prioritization_transcripts_version_force", False)
10400        )
10401
10402        # Transcripts Ranking
10403        if transcripts_preference_file:
10404
10405            # Transcripts file to dataframe
10406            if os.path.exists(transcripts_preference_file):
10407                transcripts_preference_dataframe = transcripts_file_to_df(
10408                    transcripts_preference_file
10409                )
10410            else:
10411                log.error(
10412                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10413                )
10414                raise ValueError(
10415                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10416                )
10417
10418            # Order by depending to transcript preference forcing
10419            if transcript_preference_force:
10420                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10421            else:
10422                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10423
10424            # Transcript columns joined depend on version consideration
10425            if transcript_version_force:
10426                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10427            else:
10428                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10429
10430            # Query ranking for update
10431            query_update_ranking = f"""
10432                SELECT
10433                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10434                    ROW_NUMBER() OVER (
10435                        PARTITION BY "#CHROM", POS, REF, ALT
10436                        ORDER BY {order_by}
10437                    ) AS rn
10438                FROM {transcripts_table}
10439                LEFT JOIN 
10440                    (
10441                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10442                        FROM transcripts_preference_dataframe
10443                    ) AS transcripts_preference
10444                ON {transcripts_version_join}
10445            """
10446
10447        else:
10448
10449            # Query ranking for update
10450            query_update_ranking = f"""
10451                SELECT
10452                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10453                    ROW_NUMBER() OVER (
10454                        PARTITION BY "#CHROM", POS, REF, ALT
10455                        ORDER BY {" , ".join(query_update_order_list)}
10456                    ) AS rn
10457                FROM {transcripts_table}
10458            """
10459
10460        # Export Transcripts prioritization infos to variants table
10461        query_update = f"""
10462            WITH RankedTranscripts AS (
10463                {query_update_ranking}
10464            )
10465            UPDATE {table_variants}
10466                SET
10467                INFO = CONCAT(CASE
10468                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10469                            THEN ''
10470                            ELSE concat("INFO", ';')
10471                        END,
10472                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10473                        )
10474            FROM
10475                RankedTranscripts
10476            WHERE
10477                rn = 1
10478                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10479                AND variants."POS" = RankedTranscripts."POS"
10480                AND variants."REF" = RankedTranscripts."REF"
10481                AND variants."ALT" = RankedTranscripts."ALT"     
10482        """
10483
10484        # log.debug(f"query_update={query_update}")
10485        self.execute_query(query=query_update)
10486
10487        # Return
10488        return True
10489
10490    def create_transcript_view_from_columns_map(
10491        self,
10492        transcripts_table: str = "transcripts",
10493        columns_maps: dict = {},
10494        added_columns: list = [],
10495        temporary_tables: list = None,
10496        annotation_fields: list = None,
10497        column_rename: dict = {},
10498        column_clean: bool = False,
10499        column_case: str = None,
10500    ) -> tuple[list, list, list]:
10501        """
10502        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10503        specified columns mapping for transcripts data.
10504
10505        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10506        of the table where the transcripts data is stored or will be stored in the database. This table
10507        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10508        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10509        :type transcripts_table: str (optional)
10510        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10511        about how to map columns from a transcripts table to create a view. Each entry in the
10512        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10513        typically includes details such as the main transcript column and additional information columns
10514        :type columns_maps: dict
10515        :param added_columns: The `added_columns` parameter in the
10516        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10517        that will be added to the view being created based on the columns map provided. These columns
10518        are generated by exploding the transcript information columns along with the main transcript
10519        column
10520        :type added_columns: list
10521        :param temporary_tables: The `temporary_tables` parameter in the
10522        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10523        tables created during the process of creating a transcript view from a columns map. These
10524        temporary tables are used to store intermediate results or transformations before the final view
10525        is generated
10526        :type temporary_tables: list
10527        :param annotation_fields: The `annotation_fields` parameter in the
10528        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10529        used for annotation in the query view creation process. These fields are extracted from the
10530        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10531        :type annotation_fields: list
10532        :param column_rename: The `column_rename` parameter in the
10533        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10534        custom renaming for columns during the creation of the temporary table view. This parameter
10535        provides a mapping of original column names to the desired renamed column names. By using this
10536        parameter,
10537        :type column_rename: dict
10538        :param column_clean: The `column_clean` parameter in the
10539        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10540        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10541        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10542        False
10543        :type column_clean: bool (optional)
10544        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10545        function is used to specify the case transformation to be applied to the columns during the view
10546        creation process. It allows you to control whether the column values should be converted to
10547        lowercase, uppercase, or remain unchanged
10548        :type column_case: str
10549        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10550        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10551        """
10552
10553        log.debug("Start transcrpts view creation from columns map...")
10554
10555        # "from_columns_map": [
10556        #     {
10557        #         "transcripts_column": "Ensembl_transcriptid",
10558        #         "transcripts_infos_columns": [
10559        #             "genename",
10560        #             "Ensembl_geneid",
10561        #             "LIST_S2_score",
10562        #             "LIST_S2_pred",
10563        #         ],
10564        #     },
10565        #     {
10566        #         "transcripts_column": "Ensembl_transcriptid",
10567        #         "transcripts_infos_columns": [
10568        #             "genename",
10569        #             "VARITY_R_score",
10570        #             "Aloft_pred",
10571        #         ],
10572        #     },
10573        # ],
10574
10575        # Init
10576        if temporary_tables is None:
10577            temporary_tables = []
10578        if annotation_fields is None:
10579            annotation_fields = []
10580
10581        # Variants table
10582        table_variants = self.get_table_variants()
10583
10584        for columns_map in columns_maps:
10585
10586            # Transcript column
10587            transcripts_column = columns_map.get("transcripts_column", None)
10588
10589            # Transcripts infos columns
10590            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10591
10592            # Transcripts infos columns rename
10593            column_rename = columns_map.get("column_rename", column_rename)
10594
10595            # Transcripts infos columns clean
10596            column_clean = columns_map.get("column_clean", column_clean)
10597
10598            # Transcripts infos columns case
10599            column_case = columns_map.get("column_case", column_case)
10600
10601            if transcripts_column is not None:
10602
10603                # Explode
10604                added_columns += self.explode_infos(
10605                    fields=[transcripts_column] + transcripts_infos_columns
10606                )
10607
10608                # View clauses
10609                clause_select_variants = []
10610                clause_select_tanscripts = []
10611                for field in [transcripts_column] + transcripts_infos_columns:
10612
10613                    # AS field
10614                    as_field = field
10615
10616                    # Rename
10617                    if column_rename:
10618                        as_field = column_rename.get(as_field, as_field)
10619
10620                    # Clean
10621                    if column_clean:
10622                        as_field = clean_annotation_field(as_field)
10623
10624                    # Case
10625                    if column_case:
10626                        if column_case.lower() in ["lower"]:
10627                            as_field = as_field.lower()
10628                        elif column_case.lower() in ["upper"]:
10629                            as_field = as_field.upper()
10630
10631                    # Clause select Variants
10632                    clause_select_variants.append(
10633                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10634                    )
10635
10636                    if field in [transcripts_column]:
10637                        clause_select_tanscripts.append(
10638                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10639                        )
10640                    else:
10641                        clause_select_tanscripts.append(
10642                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10643                        )
10644                        annotation_fields.append(as_field)
10645
10646                # Querey View
10647                query = f""" 
10648                    SELECT
10649                        "#CHROM", POS, REF, ALT, INFO,
10650                        "{transcripts_column}" AS 'transcript',
10651                        {", ".join(clause_select_tanscripts)}
10652                    FROM (
10653                        SELECT 
10654                            "#CHROM", POS, REF, ALT, INFO,
10655                            {", ".join(clause_select_variants)}
10656                        FROM {table_variants}
10657                        )
10658                    WHERE "{transcripts_column}" IS NOT NULL
10659                """
10660
10661                # Create temporary table
10662                temporary_table = transcripts_table + "".join(
10663                    random.choices(string.ascii_uppercase + string.digits, k=10)
10664                )
10665
10666                # Temporary_tables
10667                temporary_tables.append(temporary_table)
10668                query_view = f"""
10669                    CREATE TEMPORARY TABLE {temporary_table}
10670                    AS ({query})
10671                """
10672                self.execute_query(query=query_view)
10673
10674        return added_columns, temporary_tables, annotation_fields
10675
10676    def create_transcript_view_from_column_format(
10677        self,
10678        transcripts_table: str = "transcripts",
10679        column_formats: dict = {},
10680        temporary_tables: list = None,
10681        annotation_fields: list = None,
10682        column_rename: dict = {},
10683        column_clean: bool = False,
10684        column_case: str = None,
10685    ) -> tuple[list, list, list]:
10686        """
10687        The `create_transcript_view_from_column_format` function generates a transcript view based on
10688        specified column formats, adds additional columns and annotation fields, and returns the list of
10689        temporary tables and annotation fields.
10690
10691        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10692        of the table containing the transcripts data. This table will be used as the base table for
10693        creating the transcript view. The default value for this parameter is "transcripts", but you can
10694        provide a different table name if needed, defaults to transcripts
10695        :type transcripts_table: str (optional)
10696        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10697        about the columns to be used for creating the transcript view. Each entry in the dictionary
10698        specifies the mapping between a transcripts column and a transcripts infos column. This
10699        parameter allows you to define how the columns from the transcripts table should be transformed
10700        or mapped
10701        :type column_formats: dict
10702        :param temporary_tables: The `temporary_tables` parameter in the
10703        `create_transcript_view_from_column_format` function is a list that stores the names of
10704        temporary views created during the process of creating a transcript view from a column format.
10705        These temporary views are used to manipulate and extract data before generating the final
10706        transcript view
10707        :type temporary_tables: list
10708        :param annotation_fields: The `annotation_fields` parameter in the
10709        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10710        that are extracted from the temporary views created during the process. These annotation fields
10711        are obtained by querying the temporary views and extracting the column names excluding specific
10712        columns like `#CH
10713        :type annotation_fields: list
10714        :param column_rename: The `column_rename` parameter in the
10715        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10716        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10717        column names to new column names in this dictionary, you can rename specific columns during the
10718        process
10719        :type column_rename: dict
10720        :param column_clean: The `column_clean` parameter in the
10721        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10722        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10723        will be cleaned during the creation of the transcript view based on the specified column format,
10724        defaults to False
10725        :type column_clean: bool (optional)
10726        :param column_case: The `column_case` parameter in the
10727        `create_transcript_view_from_column_format` function is used to specify the case transformation
10728        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10729        to convert the column names to uppercase or lowercase, respectively
10730        :type column_case: str
10731        :return: The `create_transcript_view_from_column_format` function returns two lists:
10732        `temporary_tables` and `annotation_fields`.
10733        """
10734
10735        log.debug("Start transcrpts view creation from column format...")
10736
10737        #  "from_column_format": [
10738        #     {
10739        #         "transcripts_column": "ANN",
10740        #         "transcripts_infos_column": "Feature_ID",
10741        #     }
10742        # ],
10743
10744        # Init
10745        if temporary_tables is None:
10746            temporary_tables = []
10747        if annotation_fields is None:
10748            annotation_fields = []
10749
10750        for column_format in column_formats:
10751
10752            # annotation field and transcript annotation field
10753            annotation_field = column_format.get("transcripts_column", "ANN")
10754            transcript_annotation = column_format.get(
10755                "transcripts_infos_column", "Feature_ID"
10756            )
10757
10758            # Transcripts infos columns rename
10759            column_rename = column_format.get("column_rename", column_rename)
10760
10761            # Transcripts infos columns clean
10762            column_clean = column_format.get("column_clean", column_clean)
10763
10764            # Transcripts infos columns case
10765            column_case = column_format.get("column_case", column_case)
10766
10767            # Temporary View name
10768            temporary_view_name = transcripts_table + "".join(
10769                random.choices(string.ascii_uppercase + string.digits, k=10)
10770            )
10771
10772            # Create temporary view name
10773            temporary_view_name = self.annotation_format_to_table(
10774                uniquify=True,
10775                annotation_field=annotation_field,
10776                view_name=temporary_view_name,
10777                annotation_id=transcript_annotation,
10778                column_rename=column_rename,
10779                column_clean=column_clean,
10780                column_case=column_case,
10781            )
10782
10783            # Annotation fields
10784            if temporary_view_name:
10785                query_annotation_fields = f"""
10786                    SELECT *
10787                    FROM (
10788                        DESCRIBE SELECT *
10789                        FROM {temporary_view_name}
10790                        )
10791                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10792                """
10793                df_annotation_fields = self.get_query_to_df(
10794                    query=query_annotation_fields
10795                )
10796
10797                # Add temporary view and annotation fields
10798                temporary_tables.append(temporary_view_name)
10799                annotation_fields += list(set(df_annotation_fields["column_name"]))
10800
10801        return temporary_tables, annotation_fields
10802
10803    def create_transcript_view(
10804        self,
10805        transcripts_table: str = None,
10806        transcripts_table_drop: bool = False,
10807        param: dict = {},
10808    ) -> str:
10809        """
10810        The `create_transcript_view` function generates a transcript view by processing data from a
10811        specified table based on provided parameters and structural information.
10812
10813        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10814        is used to specify the name of the table that will store the final transcript view data. If a table
10815        name is not provided, the function will create a new table to store the transcript view data, and by
10816        default,, defaults to transcripts
10817        :type transcripts_table: str (optional)
10818        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10819        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10820        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10821        the function will drop the existing transcripts table if it exists, defaults to False
10822        :type transcripts_table_drop: bool (optional)
10823        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10824        contains information needed to create a transcript view. It includes details such as the structure
10825        of the transcripts, columns mapping, column formats, and other necessary information for generating
10826        the view. This parameter allows for flexibility and customization
10827        :type param: dict
10828        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10829        created or modified during the execution of the function.
10830        """
10831
10832        log.debug("Start transcripts view creation...")
10833
10834        # Default
10835        transcripts_table_default = "transcripts"
10836
10837        # Param
10838        if not param:
10839            param = self.get_param()
10840
10841        # Struct
10842        struct = param.get("transcripts", {}).get("struct", None)
10843
10844        # Transcript veresion
10845        transcript_id_remove_version = param.get("transcripts", {}).get(
10846            "transcript_id_remove_version", False
10847        )
10848
10849        # Transcripts mapping
10850        transcript_id_mapping_file = param.get("transcripts", {}).get(
10851            "transcript_id_mapping_file", None
10852        )
10853
10854        # Transcripts mapping
10855        transcript_id_mapping_force = param.get("transcripts", {}).get(
10856            "transcript_id_mapping_force", None
10857        )
10858
10859        if struct:
10860
10861            # Transcripts table
10862            if transcripts_table is None:
10863                transcripts_table = param.get("transcripts", {}).get(
10864                    "table", transcripts_table_default
10865                )
10866
10867            # added_columns
10868            added_columns = []
10869
10870            # Temporary tables
10871            temporary_tables = []
10872
10873            # Annotation fields
10874            annotation_fields = []
10875
10876            # from columns map
10877            columns_maps = struct.get("from_columns_map", [])
10878            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10879                self.create_transcript_view_from_columns_map(
10880                    transcripts_table=transcripts_table,
10881                    columns_maps=columns_maps,
10882                    added_columns=added_columns,
10883                    temporary_tables=temporary_tables,
10884                    annotation_fields=annotation_fields,
10885                )
10886            )
10887            added_columns += added_columns_tmp
10888            temporary_tables += temporary_tables_tmp
10889            annotation_fields += annotation_fields_tmp
10890
10891            # from column format
10892            column_formats = struct.get("from_column_format", [])
10893            temporary_tables_tmp, annotation_fields_tmp = (
10894                self.create_transcript_view_from_column_format(
10895                    transcripts_table=transcripts_table,
10896                    column_formats=column_formats,
10897                    temporary_tables=temporary_tables,
10898                    annotation_fields=annotation_fields,
10899                )
10900            )
10901            temporary_tables += temporary_tables_tmp
10902            annotation_fields += annotation_fields_tmp
10903
10904            # Remove some specific fields/column
10905            annotation_fields = list(set(annotation_fields))
10906            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10907                if field in annotation_fields:
10908                    annotation_fields.remove(field)
10909
10910            # Merge temporary tables query
10911            query_merge = ""
10912            for temporary_table in list(set(temporary_tables)):
10913
10914                # First temporary table
10915                if not query_merge:
10916                    query_merge = f"""
10917                        SELECT * FROM {temporary_table}
10918                    """
10919                # other temporary table (using UNION)
10920                else:
10921                    query_merge += f"""
10922                        UNION BY NAME SELECT * FROM {temporary_table}
10923                    """
10924
10925            # transcript table tmp
10926            transcript_table_tmp = "transcripts_tmp"
10927            transcript_table_tmp2 = "transcripts_tmp2"
10928            transcript_table_tmp3 = "transcripts_tmp3"
10929
10930            # Merge on transcript
10931            query_merge_on_transcripts_annotation_fields = []
10932
10933            # Add transcript list
10934            query_merge_on_transcripts_annotation_fields.append(
10935                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10936            )
10937
10938            # Aggregate all annotations fields
10939            for annotation_field in set(annotation_fields):
10940                query_merge_on_transcripts_annotation_fields.append(
10941                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10942                )
10943
10944            # Transcripts mapping
10945            if transcript_id_mapping_file:
10946
10947                # Transcript dataframe
10948                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10949                transcript_id_mapping_dataframe = transcripts_file_to_df(
10950                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10951                )
10952
10953                # Transcript version remove
10954                if transcript_id_remove_version:
10955                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10956                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10957                    query_left_join = f"""
10958                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10959                    """
10960                else:
10961                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10962                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10963                    query_left_join = f"""
10964                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10965                    """
10966
10967                # Transcript column for group by merge
10968                query_transcript_merge_group_by = """
10969                        CASE
10970                            WHEN transcript_mapped NOT IN ('')
10971                            THEN split_part(transcript_mapped, '.', 1)
10972                            ELSE split_part(transcript_original, '.', 1)
10973                        END
10974                    """
10975
10976                # Merge query
10977                transcripts_tmp2_query = f"""
10978                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10979                    FROM ({query_merge}) AS {transcript_table_tmp}
10980                    {query_left_join}
10981                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10982                """
10983
10984                # Retrive columns after mege
10985                transcripts_tmp2_describe_query = f"""
10986                    DESCRIBE {transcripts_tmp2_query}
10987                """
10988                transcripts_tmp2_describe_list = list(
10989                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10990                        "column_name"
10991                    ]
10992                )
10993
10994                # Create list of columns for select clause
10995                transcripts_tmp2_describe_select_clause = []
10996                for field in transcripts_tmp2_describe_list:
10997                    if field not in [
10998                        "#CHROM",
10999                        "POS",
11000                        "REF",
11001                        "ALT",
11002                        "INFO",
11003                        "transcript_mapped",
11004                    ]:
11005                        as_field = field
11006                        if field in ["transcript_original"]:
11007                            as_field = "transcripts_mapped"
11008                        transcripts_tmp2_describe_select_clause.append(
11009                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11010                        )
11011
11012                # Merge with mapping
11013                query_merge_on_transcripts = f"""
11014                    SELECT
11015                        "#CHROM", POS, REF, ALT, INFO,
11016                        CASE
11017                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11018                            THEN ANY_VALUE(transcript_mapped)
11019                            ELSE ANY_VALUE(transcript_original)
11020                        END AS transcript,
11021                        {", ".join(transcripts_tmp2_describe_select_clause)}
11022                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11023                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11024                        {query_transcript_merge_group_by}
11025                """
11026
11027                # Add transcript filter from mapping file
11028                if transcript_id_mapping_force:
11029                    query_merge_on_transcripts = f"""
11030                        SELECT *
11031                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11032                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11033                    """
11034
11035            # No transcript mapping
11036            else:
11037
11038                # Remove transcript version
11039                if transcript_id_remove_version:
11040                    query_transcript_column = f"""
11041                        split_part({transcript_table_tmp}.transcript, '.', 1)
11042                    """
11043                else:
11044                    query_transcript_column = """
11045                        transcript
11046                    """
11047
11048                # Query sections
11049                query_transcript_column_select = (
11050                    f"{query_transcript_column} AS transcript"
11051                )
11052                query_transcript_column_group_by = query_transcript_column
11053
11054                # Query for transcripts view
11055                query_merge_on_transcripts = f"""
11056                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11057                    FROM ({query_merge}) AS {transcript_table_tmp}
11058                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11059                """
11060
11061            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11062
11063            # Drop transcript view is necessary
11064            if transcripts_table_drop:
11065                query_drop = f"""
11066                    DROP TABLE IF EXISTS {transcripts_table};
11067                """
11068                self.execute_query(query=query_drop)
11069
11070            # Merge and create transcript view
11071            query_create_view = f"""
11072                CREATE TABLE IF NOT EXISTS {transcripts_table}
11073                AS {query_merge_on_transcripts}
11074            """
11075            self.execute_query(query=query_create_view)
11076
11077            # Remove added columns
11078            for added_column in added_columns:
11079                self.drop_column(column=added_column)
11080
11081        else:
11082
11083            transcripts_table = None
11084
11085        return transcripts_table
11086
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a snpEff/VEP-style annotation INFO field (e.g. "ANN") into a
        temporary table with one row per annotation entry and one typed column
        per annotation sub-field.

        The sub-field layout is read from the field's description in the VCF
        header (the quoted, pipe-separated part, e.g. 'A | B | C'). Each
        variant's annotation string is converted to a JSON document, the JSON
        keys are discovered through DuckDB, each key's column type is inferred
        with `detect_column_type`, and a temporary table `view_name` is created
        with an extra `transcript` column taken from the `annotation_id`
        sub-field.

        :param uniquify: ensure unique values when exploding the annotation
        (passed through to `explode_annotation_format`), defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field holding the annotations, defaults
        to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: annotation sub-field used as the transcript
        identifier column, defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: name of the temporary table to create, defaults to
        "transcripts"
        :type view_name: str (optional)
        :param column_rename: optional mapping of original sub-field names to
        new column names
        :type column_rename: dict
        :param column_clean: clean column names with `clean_annotation_field`
        if True, defaults to False
        :type column_clean: bool (optional)
        :param column_case: force column names to "lower" or "upper" case
        :type column_case: str
        :return: the name of the created table (`view_name`), or None if
        `annotation_field` is not declared in the VCF header
        """

        # Name of the derived column that will hold the annotation string
        # exploded into JSON
        annotation_format = "annotation_explode"

        # Apply the same rename/clean to the transcript id sub-field so that it
        # matches the generated column names used in the final SELECT
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns
        # NOTE(review): any truthy configured prefix is forced to "INFO/" here,
        # discarding the configured value itself — confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the exploded annotation field and of its JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added to the variants table during processing; dropped at the end
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field layout from the header description,
            # expected as a quoted, pipe-separated list (e.g. 'A | B | C')
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only version of the sub-field name, used as key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    # Map cleaned name -> original sub-field name; only the
                    # original names (values) are used further down
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create a unique variant id column (dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Fetch variants with their raw annotation column into a dataframe
            # (queried below by name through DuckDB's dataframe registration)
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation string into a JSON document
            # (one JSON object per annotation entry)
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Discover the JSON keys present in the first annotation entry
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per discovered key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]
                key_clean = key

                # key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract the key's values in order to infer the column type
                # NOTE(review): trim() is applied to the literal key name, not
                # to the extracted value, so this WHERE clause is constant per
                # key; empty values are actually dropped by the pandas cleanup
                # below — confirm intent
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining (non-empty) values
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed extraction expression; empty strings become NULL
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, adding a 'transcript' column taken
            # from the annotation id sub-field
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: nothing to explode
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
11286
11287    def transcript_view_to_variants(
11288        self,
11289        transcripts_table: str = None,
11290        transcripts_column_id: str = None,
11291        transcripts_info_json: str = None,
11292        transcripts_info_field_json: str = None,
11293        transcripts_info_format: str = None,
11294        transcripts_info_field_format: str = None,
11295        param: dict = {},
11296    ) -> bool:
11297        """
11298        The `transcript_view_to_variants` function updates a variants table with information from
11299        transcripts in JSON format.
11300
11301        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11302        table containing the transcripts data. If this parameter is not provided, the function will
11303        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11304        :type transcripts_table: str
11305        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11306        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11307        identifier is used to match transcripts with variants in the database
11308        :type transcripts_column_id: str
11309        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11310        of the column in the variants table where the transcripts information will be stored in JSON
11311        format. This parameter allows you to define the column in the variants table that will hold the
11312        JSON-formatted information about transcripts
11313        :type transcripts_info_json: str
11314        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11315        specify the field in the VCF header that will contain information about transcripts in JSON
11316        format. This field will be added to the VCF header as an INFO field with the specified name
11317        :type transcripts_info_field_json: str
11318        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11319        format of the information about transcripts that will be stored in the variants table. This
11320        format can be used to define how the transcript information will be structured or displayed
11321        within the variants table
11322        :type transcripts_info_format: str
11323        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11324        specify the field in the VCF header that will contain information about transcripts in a
11325        specific format. This field will be added to the VCF header as an INFO field with the specified
11326        name
11327        :type transcripts_info_field_format: str
11328        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11329        that contains various configuration settings related to transcripts. It is used to provide
11330        default values for certain parameters if they are not explicitly provided when calling the
11331        method. The `param` dictionary can be passed as an argument
11332        :type param: dict
11333        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11334        if the operation is successful and `False` if certain conditions are not met.
11335        """
11336
11337        msg_info_prefix = "Start transcripts view to variants annotations"
11338
11339        log.debug(f"{msg_info_prefix}...")
11340
11341        # Default
11342        transcripts_table_default = "transcripts"
11343        transcripts_column_id_default = "transcript"
11344        transcripts_info_json_default = None
11345        transcripts_info_format_default = None
11346        transcripts_info_field_json_default = None
11347        transcripts_info_field_format_default = None
11348
11349        # Param
11350        if not param:
11351            param = self.get_param()
11352
11353        # Transcripts table
11354        if transcripts_table is None:
11355            transcripts_table = param.get("transcripts", {}).get(
11356                "table", transcripts_table_default
11357            )
11358
11359        # Transcripts column ID
11360        if transcripts_column_id is None:
11361            transcripts_column_id = param.get("transcripts", {}).get(
11362                "column_id", transcripts_column_id_default
11363            )
11364
11365        # Transcripts info json
11366        if transcripts_info_json is None:
11367            transcripts_info_json = param.get("transcripts", {}).get(
11368                "transcripts_info_json", transcripts_info_json_default
11369            )
11370
11371        # Transcripts info field JSON
11372        if transcripts_info_field_json is None:
11373            transcripts_info_field_json = param.get("transcripts", {}).get(
11374                "transcripts_info_field_json", transcripts_info_field_json_default
11375            )
11376        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11377        #     transcripts_info_json = transcripts_info_field_json
11378
11379        # Transcripts info format
11380        if transcripts_info_format is None:
11381            transcripts_info_format = param.get("transcripts", {}).get(
11382                "transcripts_info_format", transcripts_info_format_default
11383            )
11384
11385        # Transcripts info field FORMAT
11386        if transcripts_info_field_format is None:
11387            transcripts_info_field_format = param.get("transcripts", {}).get(
11388                "transcripts_info_field_format", transcripts_info_field_format_default
11389            )
11390        # if (
11391        #     transcripts_info_field_format is not None
11392        #     and transcripts_info_format is None
11393        # ):
11394        #     transcripts_info_format = transcripts_info_field_format
11395
11396        # Variants table
11397        table_variants = self.get_table_variants()
11398
11399        # Check info columns param
11400        if (
11401            transcripts_info_json is None
11402            and transcripts_info_field_json is None
11403            and transcripts_info_format is None
11404            and transcripts_info_field_format is None
11405        ):
11406            return False
11407
11408        # Transcripts infos columns
11409        query_transcripts_infos_columns = f"""
11410            SELECT *
11411            FROM (
11412                DESCRIBE SELECT * FROM {transcripts_table}
11413                )
11414            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11415        """
11416        transcripts_infos_columns = list(
11417            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11418        )
11419
11420        # View results
11421        clause_select = []
11422        clause_to_json = []
11423        clause_to_format = []
11424        for field in transcripts_infos_columns:
11425            # Do not consider INFO field for export into fields
11426            if field not in ["INFO"]:
11427                clause_select.append(
11428                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11429                )
11430                clause_to_json.append(f""" '{field}': "{field}" """)
11431                clause_to_format.append(f""" "{field}" """)
11432
11433        # Update
11434        update_set_json = []
11435        update_set_format = []
11436
11437        # VCF header
11438        vcf_reader = self.get_header()
11439
11440        # Transcripts to info column in JSON
11441        if transcripts_info_json:
11442
11443            # Create column on variants table
11444            self.add_column(
11445                table_name=table_variants,
11446                column_name=transcripts_info_json,
11447                column_type="JSON",
11448                default_value=None,
11449                drop=False,
11450            )
11451
11452            # Add header
11453            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11454                transcripts_info_json,
11455                ".",
11456                "String",
11457                "Transcripts in JSON format",
11458                "unknwon",
11459                "unknwon",
11460                self.code_type_map["String"],
11461            )
11462
11463            # Add to update
11464            update_set_json.append(
11465                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11466            )
11467
11468        # Transcripts to info field in JSON
11469        if transcripts_info_field_json:
11470
11471            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11472
11473            # Add to update
11474            update_set_json.append(
11475                f""" 
11476                    INFO = concat(
11477                            CASE
11478                                WHEN INFO NOT IN ('', '.')
11479                                THEN INFO
11480                                ELSE ''
11481                            END,
11482                            CASE
11483                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11484                                THEN concat(
11485                                    ';{transcripts_info_field_json}=',
11486                                    t.{transcripts_info_json}
11487                                )
11488                                ELSE ''
11489                            END
11490                            )
11491                """
11492            )
11493
11494            # Add header
11495            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11496                transcripts_info_field_json,
11497                ".",
11498                "String",
11499                "Transcripts in JSON format",
11500                "unknwon",
11501                "unknwon",
11502                self.code_type_map["String"],
11503            )
11504
11505        if update_set_json:
11506
11507            # Update query
11508            query_update = f"""
11509                UPDATE {table_variants}
11510                    SET {", ".join(update_set_json)}
11511                FROM
11512                (
11513                    SELECT
11514                        "#CHROM", POS, REF, ALT,
11515                            concat(
11516                            '{{',
11517                            string_agg(
11518                                '"' || "{transcripts_column_id}" || '":' ||
11519                                to_json(json_output)
11520                            ),
11521                            '}}'
11522                            )::JSON AS {transcripts_info_json}
11523                    FROM
11524                        (
11525                        SELECT
11526                            "#CHROM", POS, REF, ALT,
11527                            "{transcripts_column_id}",
11528                            to_json(
11529                                {{{",".join(clause_to_json)}}}
11530                            )::JSON AS json_output
11531                        FROM
11532                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11533                        WHERE "{transcripts_column_id}" IS NOT NULL
11534                        )
11535                    GROUP BY "#CHROM", POS, REF, ALT
11536                ) AS t
11537                WHERE {table_variants}."#CHROM" = t."#CHROM"
11538                    AND {table_variants}."POS" = t."POS"
11539                    AND {table_variants}."REF" = t."REF"
11540                    AND {table_variants}."ALT" = t."ALT"
11541            """
11542
11543            self.execute_query(query=query_update)
11544
11545        # Transcripts to info column in FORMAT
11546        if transcripts_info_format:
11547
11548            # Create column on variants table
11549            self.add_column(
11550                table_name=table_variants,
11551                column_name=transcripts_info_format,
11552                column_type="VARCHAR",
11553                default_value=None,
11554                drop=False,
11555            )
11556
11557            # Add header
11558            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11559                transcripts_info_format,
11560                ".",
11561                "String",
11562                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11563                "unknwon",
11564                "unknwon",
11565                self.code_type_map["String"],
11566            )
11567
11568            # Add to update
11569            update_set_format.append(
11570                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11571            )
11572
11573        else:
11574
11575            # Set variable for internal queries
11576            transcripts_info_format = "transcripts_info_format"
11577
11578        # Transcripts to info field in JSON
11579        if transcripts_info_field_format:
11580
11581            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11582
11583            # Add to update
11584            update_set_format.append(
11585                f""" 
11586                    INFO = concat(
11587                            CASE
11588                                WHEN INFO NOT IN ('', '.')
11589                                THEN INFO
11590                                ELSE ''
11591                            END,
11592                            CASE
11593                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11594                                THEN concat(
11595                                    ';{transcripts_info_field_format}=',
11596                                    t.{transcripts_info_format}
11597                                )
11598                                ELSE ''
11599                            END
11600                            )
11601                """
11602            )
11603
11604            # Add header
11605            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11606                transcripts_info_field_format,
11607                ".",
11608                "String",
11609                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11610                "unknwon",
11611                "unknwon",
11612                self.code_type_map["String"],
11613            )
11614
11615        if update_set_format:
11616
11617            # Update query
11618            query_update = f"""
11619                UPDATE {table_variants}
11620                    SET {", ".join(update_set_format)}
11621                FROM
11622                (
11623                    SELECT
11624                        "#CHROM", POS, REF, ALT,
11625                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11626                    FROM 
11627                        (
11628                        SELECT
11629                            "#CHROM", POS, REF, ALT,
11630                            "{transcripts_column_id}",
11631                            concat(
11632                                "{transcripts_column_id}",
11633                                '|',
11634                                {", '|', ".join(clause_to_format)}
11635                            ) AS {transcripts_info_format}
11636                        FROM
11637                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11638                        )
11639                    GROUP BY "#CHROM", POS, REF, ALT
11640                ) AS t
11641                WHERE {table_variants}."#CHROM" = t."#CHROM"
11642                    AND {table_variants}."POS" = t."POS"
11643                    AND {table_variants}."REF" = t."REF"
11644                    AND {table_variants}."ALT" = t."ALT"
11645            """
11646
11647            self.execute_query(query=query_update)
11648
11649        return True
11650
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        Rename or remove INFO fields, both in the VCF header and in the INFO
        column of the variants table.

        :param fields_to_rename: mapping of original INFO field names to their
        new names; a value of None means the field is removed instead of
        renamed
        :type fields_to_rename: dict
        :param table: name of the variants table to update (defaults to the
        table returned by `get_table_variants`)
        :type table: str
        :return: a dictionary mapping each processed original field name to its
        new name (or None when the field was removed). Empty when nothing was
        renamed (e.g. read-only access or no mapping given).
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        if table is None:
            table = self.get_table_variants()

        # Renaming is applied with nested regexp_replace() calls on the INFO
        # column; the chain is split into partitions of
        # `regex_replace_partition` fields per UPDATE, presumably to keep the
        # nested SQL expression depth manageable -- TODO confirm the limit
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "INFO"

        # Nothing is modified when no mapping is given or access is read-only
        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                # Only fields present in the header are processed
                if field_to_rename in header.infos:

                    # Rename in header: copy the INFO definition under the new
                    # name, then drop the old entry (for removal: drop only)
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # Pattern matching "FIELD=value" at the start of INFO or
                    # after a ';' separator.
                    # NOTE(review): Flag fields (no '=value' part) do not match
                    # this pattern and are left untouched in INFO -- confirm
                    field_pattern = rf'(^|;)({field_to_rename})=([^;]*)'
                    if field_renamed is not None:
                        field_renamed_pattern = rf'\1{field_renamed}=\3'
                    else:
                        field_renamed_pattern = ''

                    # Chain this replacement onto the current partition's
                    # expression; restart a fresh chain from "INFO" every
                    # `regex_replace_partition` fields
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "INFO"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Record the processed field for the return value
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
                    else:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' removed")

            # Apply one UPDATE per partition of chained replacements
            for regex_replace_key, regex_replace  in regex_replace_dict.items():
                log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = {regex_replace}
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed
11748
11749    def calculation_rename_info_fields(
11750        self,
11751        fields_to_rename: dict = None,
11752        table: str = None,
11753        operation_name: str = "RENAME_INFO_FIELDS",
11754    ) -> None:
11755        """
11756        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11757        fields to rename and table if provided, and then calls another function to rename the fields.
11758
11759        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11760        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11761        the key and the new field name as the value
11762        :type fields_to_rename: dict
11763        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11764        specify the name of the table for which the fields are to be renamed. It is a string type
11765        parameter
11766        :type table: str
11767        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11768        method is a string that specifies the name of the operation being performed. In this context, it
11769        is used as a default value for the operation name if not explicitly provided when calling the
11770        function, defaults to RENAME_INFO_FIELDS
11771        :type operation_name: str (optional)
11772        """
11773
11774        # Param
11775        param = self.get_param()
11776
11777        # Get param fields to rename
11778        param_fields_to_rename = (
11779            param.get("calculation", {})
11780            .get("calculations", {})
11781            .get(operation_name, {})
11782            .get("fields_to_rename", None)
11783        )
11784
11785        # Get param table
11786        param_table = (
11787            param.get("calculation", {})
11788            .get("calculations", {})
11789            .get(operation_name, {})
11790            .get("table", None)
11791        )
11792
11793        # Init fields_to_rename
11794        if fields_to_rename is None:
11795            fields_to_rename = param_fields_to_rename
11796
11797        # Init table
11798        if table is None:
11799            table = param_table
11800
11801        renamed_fields = self.rename_info_fields(
11802            fields_to_rename=fields_to_rename, table=table
11803        )
11804
11805        log.debug(f"renamed_fields:{renamed_fields}")
class Variants:
   37class Variants:
   38
   39    def __init__(
   40        self,
   41        conn=None,
   42        input: str = None,
   43        output: str = None,
   44        config: dict = {},
   45        param: dict = {},
   46        load: bool = False,
   47    ) -> None:
   48        """
   49        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   50        header
   51
   52        :param conn: the connection to the database
   53        :param input: the input file
   54        :param output: the output file
   55        :param config: a dictionary containing the configuration of the model
   56        :param param: a dictionary containing the parameters of the model
   57        """
   58
   59        # Init variables
   60        self.init_variables()
   61
   62        # Input
   63        self.set_input(input)
   64
   65        # Config
   66        self.set_config(config)
   67
   68        # Param
   69        self.set_param(param)
   70
   71        # Output
   72        self.set_output(output)
   73
   74        # connexion
   75        self.set_connexion(conn)
   76
   77        # Header
   78        self.set_header()
   79
   80        # Samples
   81        self.set_samples()
   82
   83        # Load data
   84        if load:
   85            self.load_data()
   86
   87    def set_samples(self, samples: list = None) -> list:
   88        """
   89        The function `set_samples` sets the samples attribute of an object to a provided list or
   90        retrieves it from a parameter dictionary.
   91
   92        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   93        input and sets the `samples` attribute of the class to the provided list. If no samples are
   94        provided, it tries to get the samples from the class's parameters using the `get_param` method
   95        :type samples: list
   96        :return: The `samples` list is being returned.
   97        """
   98
   99        if not samples:
  100            samples = self.get_param().get("samples", {}).get("list", None)
  101
  102        self.samples = samples
  103
  104        return samples
  105
  106    def get_samples(self) -> list:
  107        """
  108        This function returns a list of samples.
  109        :return: The `get_samples` method is returning the `samples` attribute of the object.
  110        """
  111
  112        return self.samples
  113
    def get_samples_check(self) -> bool:
        """
        Return whether samples checking is enabled.

        Reads the "check" key of the "samples" section of the parameters.

        :return: the value of param["samples"]["check"]; defaults to True when
        the key is absent (note: True, not False -- checking is on by default).
        """

        return self.get_param().get("samples", {}).get("check", True)
  124
  125    def set_input(self, input: str = None) -> None:
  126        """
  127        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  128        attributes in the class accordingly.
  129
  130        :param input: The `set_input` method in the provided code snippet is used to set attributes
  131        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  132        :type input: str
  133        """
  134
  135        if input and not isinstance(input, str):
  136            try:
  137                self.input = input.name
  138            except:
  139                log.error(f"Input file '{input} in bad format")
  140                raise ValueError(f"Input file '{input} in bad format")
  141        else:
  142            self.input = input
  143
  144        # Input format
  145        if input:
  146            input_name, input_extension = os.path.splitext(self.input)
  147            self.input_name = input_name
  148            self.input_extension = input_extension
  149            self.input_format = self.input_extension.replace(".", "")
  150
  151    def set_config(self, config: dict) -> None:
  152        """
  153        The set_config function takes a config object and assigns it as the configuration object for the
  154        class.
  155
  156        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  157        contains configuration settings for the class. When you call the `set_config` function with a
  158        dictionary object as the argument, it will set that dictionary as the configuration object for
  159        the class
  160        :type config: dict
  161        """
  162
  163        self.config = config
  164
  165    def set_param(self, param: dict) -> None:
  166        """
  167        This function sets a parameter object for the class based on the input dictionary.
  168
  169        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  170        as the `param` attribute of the class instance
  171        :type param: dict
  172        """
  173
  174        self.param = param
  175
  176    def init_variables(self) -> None:
  177        """
  178        This function initializes the variables that will be used in the rest of the class
  179        """
  180
  181        self.prefix = "howard"
  182        self.table_variants = "variants"
  183        self.dataframe = None
  184
  185        self.comparison_map = {
  186            "gt": ">",
  187            "gte": ">=",
  188            "lt": "<",
  189            "lte": "<=",
  190            "equals": "=",
  191            "contains": "SIMILAR TO",
  192        }
  193
  194        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  195
  196        self.code_type_map_to_sql = {
  197            "Integer": "INTEGER",
  198            "String": "VARCHAR",
  199            "Float": "FLOAT",
  200            "Flag": "VARCHAR",
  201        }
  202
  203        self.index_additionnal_fields = []
  204
  205    def get_indexing(self) -> bool:
  206        """
  207        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  208        returns False.
  209        :return: The value of the indexing parameter.
  210        """
  211
  212        return self.get_param().get("indexing", False)
  213
  214    def get_connexion_config(self) -> dict:
  215        """
  216        The function `get_connexion_config` returns a dictionary containing the configuration for a
  217        connection, including the number of threads and memory limit.
  218        :return: a dictionary containing the configuration for the Connexion library.
  219        """
  220
  221        # config
  222        config = self.get_config()
  223
  224        # Connexion config
  225        connexion_config = {}
  226        threads = self.get_threads()
  227
  228        # Threads
  229        if threads:
  230            connexion_config["threads"] = threads
  231
  232        # Memory
  233        # if config.get("memory", None):
  234        #     connexion_config["memory_limit"] = config.get("memory")
  235        if self.get_memory():
  236            connexion_config["memory_limit"] = self.get_memory()
  237
  238        # Temporary directory
  239        if config.get("tmp", None):
  240            connexion_config["temp_directory"] = config.get("tmp")
  241
  242        # Access
  243        if config.get("access", None):
  244            access = config.get("access")
  245            if access in ["RO"]:
  246                access = "READ_ONLY"
  247            elif access in ["RW"]:
  248                access = "READ_WRITE"
  249            connexion_db = self.get_connexion_db()
  250            if connexion_db in ":memory:":
  251                access = "READ_WRITE"
  252            connexion_config["access_mode"] = access
  253
  254        return connexion_config
  255
  256    def get_duckdb_settings(self) -> dict:
  257        """
  258        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  259        string.
  260        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  261        """
  262
  263        # config
  264        config = self.get_config()
  265
  266        # duckdb settings
  267        duckdb_settings_dict = {}
  268        if config.get("duckdb_settings", None):
  269            duckdb_settings = config.get("duckdb_settings")
  270            duckdb_settings = full_path(duckdb_settings)
  271            # duckdb setting is a file
  272            if os.path.exists(duckdb_settings):
  273                with open(duckdb_settings) as json_file:
  274                    duckdb_settings_dict = yaml.safe_load(json_file)
  275            # duckdb settings is a string
  276            else:
  277                duckdb_settings_dict = json.loads(duckdb_settings)
  278
  279        return duckdb_settings_dict
  280
  281    def set_connexion_db(self) -> str:
  282        """
  283        The function `set_connexion_db` returns the appropriate database connection string based on the
  284        input format and connection type.
  285        :return: the value of the variable `connexion_db`.
  286        """
  287
  288        # Default connexion db
  289        default_connexion_db = ":memory:"
  290
  291        # Find connexion db
  292        if self.get_input_format() in ["db", "duckdb"]:
  293            connexion_db = self.get_input()
  294        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  295            connexion_db = default_connexion_db
  296        elif self.get_connexion_type() in ["tmpfile"]:
  297            tmp_name = tempfile.mkdtemp(
  298                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  299            )
  300            connexion_db = f"{tmp_name}/tmp.db"
  301        elif self.get_connexion_type() != "":
  302            connexion_db = self.get_connexion_type()
  303        else:
  304            connexion_db = default_connexion_db
  305
  306        # Set connexion db
  307        self.connexion_db = connexion_db
  308
  309        return connexion_db
  310
  311    def set_connexion(self, conn) -> None:
  312        """
  313        The function `set_connexion` creates a connection to a database, with options for different
  314        database formats and settings.
  315
  316        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  317        database. If a connection is not provided, a new connection to an in-memory database is created.
  318        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  319        sqlite
  320        """
  321
  322        # Connexion db
  323        connexion_db = self.set_connexion_db()
  324
  325        # Connexion config
  326        connexion_config = self.get_connexion_config()
  327
  328        # Connexion format
  329        connexion_format = self.get_config().get("connexion_format", "duckdb")
  330        # Set connexion format
  331        self.connexion_format = connexion_format
  332
  333        # Connexion
  334        if not conn:
  335            if connexion_format in ["duckdb"]:
  336                conn = duckdb.connect(connexion_db, config=connexion_config)
  337                # duckDB settings
  338                duckdb_settings = self.get_duckdb_settings()
  339                if duckdb_settings:
  340                    for setting in duckdb_settings:
  341                        setting_value = duckdb_settings.get(setting)
  342                        if isinstance(setting_value, str):
  343                            setting_value = f"'{setting_value}'"
  344                        conn.execute(f"PRAGMA {setting}={setting_value};")
  345            elif connexion_format in ["sqlite"]:
  346                conn = sqlite3.connect(connexion_db)
  347
  348        # Set connexion
  349        self.conn = conn
  350
  351        # Log
  352        log.debug(f"connexion_format: {connexion_format}")
  353        log.debug(f"connexion_db: {connexion_db}")
  354        log.debug(f"connexion config: {connexion_config}")
  355        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  356
  357    def set_output(self, output: str = None) -> None:
  358        """
  359        The `set_output` function in Python sets the output file based on the input or a specified key
  360        in the config file, extracting the output name, extension, and format.
  361
  362        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  363        the output file. If the config file has an 'output' key, the method sets the output to the value
  364        of that key. If no output is provided, it sets the output to `None`
  365        :type output: str
  366        """
  367
  368        if output and not isinstance(output, str):
  369            self.output = output.name
  370        else:
  371            self.output = output
  372
  373        # Output format
  374        if self.output:
  375            output_name, output_extension = os.path.splitext(self.output)
  376            self.output_name = output_name
  377            self.output_extension = output_extension
  378            self.output_format = self.output_extension.replace(".", "")
  379        else:
  380            self.output_name = None
  381            self.output_extension = None
  382            self.output_format = None
  383
    def set_header(self) -> None:
        """
        Read the VCF header of the input file and store it both as a list of
        strings (`header_list`) and as a `vcf.Reader` object (`header_vcf`).

        Header resolution order:
        1. explicit header file from config "header_file";
        2. header embedded in the input file itself (vcf/hdr formats);
        3. a "<input>.hdr" file next to the input;
        4. a header inferred from the file columns (via Database);
        5. a minimal default VCF header as a last resort.

        When there is no input file, `header_list` and `header_vcf` are set to
        None.
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as a fallback (columns are tab-separated)
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Resolve to a full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # Header explicitly provided in config
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # Header embedded within the input file itself (vcf/hdr formats)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # bgzip-compressed vcf file (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # uncompressed vcf file (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # Header provided in the default external file "<input>.hdr"
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database object on the input file
                            db_for_header = Database(database=input_file)

                            # Infer header INFO definitions from the columns
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write the inferred header to a temporary file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Re-read it and replace the #CHROM line with the
                            # real file columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except silently falls back to the
                    # default header on any error -- consider narrowing
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # Unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
  485
  486    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  487        """
  488        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  489        DataFrame based on the connection format.
  490
  491        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  492        represents the SQL query you want to execute. This query will be used to fetch data from a
  493        database and convert it into a pandas DataFrame
  494        :type query: str
  495        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  496        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  497        function will only fetch up to that number of rows from the database query result. If no limit
  498        is specified,
  499        :type limit: int
  500        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  501        """
  502
  503        # Connexion format
  504        connexion_format = self.get_connexion_format()
  505
  506        # Limit in query
  507        if limit:
  508            pd.set_option("display.max_rows", limit)
  509            if connexion_format in ["duckdb"]:
  510                df = (
  511                    self.conn.execute(query)
  512                    .fetch_record_batch(limit)
  513                    .read_next_batch()
  514                    .to_pandas()
  515                )
  516            elif connexion_format in ["sqlite"]:
  517                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  518
  519        # Full query
  520        else:
  521            if connexion_format in ["duckdb"]:
  522                df = self.conn.execute(query).df()
  523            elif connexion_format in ["sqlite"]:
  524                df = pd.read_sql_query(query, self.conn)
  525
  526        return df
  527
  528    def get_overview(self) -> None:
  529        """
  530        The function prints the input, output, config, and dataframe of the current object
  531        """
  532        table_variants_from = self.get_table_variants(clause="from")
  533        sql_columns = self.get_header_columns_as_sql()
  534        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  535        df = self.get_query_to_df(sql_query_export)
  536        log.info(
  537            "Input:  "
  538            + str(self.get_input())
  539            + " ["
  540            + str(str(self.get_input_format()))
  541            + "]"
  542        )
  543        log.info(
  544            "Output: "
  545            + str(self.get_output())
  546            + " ["
  547            + str(str(self.get_output_format()))
  548            + "]"
  549        )
  550        log.info("Config: ")
  551        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  552            "\n"
  553        ):
  554            log.info("\t" + str(d))
  555        log.info("Param: ")
  556        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  557            "\n"
  558        ):
  559            log.info("\t" + str(d))
  560        log.info("Sample list: " + str(self.get_header_sample_list()))
  561        log.info("Dataframe: ")
  562        for d in str(df).split("\n"):
  563            log.info("\t" + str(d))
  564
  565        # garbage collector
  566        del df
  567        gc.collect()
  568
  569        return None
  570
  571    def get_stats(self) -> dict:
  572        """
  573        The `get_stats` function calculates and returns various statistics of the current object,
  574        including information about the input file, variants, samples, header fields, quality, and
  575        SNVs/InDels.
  576        :return: a dictionary containing various statistics of the current object. The dictionary has
  577        the following structure:
  578        """
  579
  580        # Log
  581        log.info(f"Stats Calculation...")
  582
  583        # table varaints
  584        table_variants_from = self.get_table_variants()
  585
  586        # stats dict
  587        stats = {"Infos": {}}
  588
  589        ### File
  590        input_file = self.get_input()
  591        stats["Infos"]["Input file"] = input_file
  592
  593        # Header
  594        header_infos = self.get_header().infos
  595        header_formats = self.get_header().formats
  596        header_infos_list = list(header_infos)
  597        header_formats_list = list(header_formats)
  598
  599        ### Variants
  600
  601        stats["Variants"] = {}
  602
  603        # Variants by chr
  604        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
  605        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
  606        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
  607            by=["CHROM"], kind="quicksort"
  608        )
  609
  610        # Total number of variants
  611        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
  612
  613        # Calculate percentage
  614        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
  615            lambda x: (x / nb_of_variants)
  616        )
  617
  618        stats["Variants"]["Number of variants by chromosome"] = (
  619            nb_of_variants_by_chrom.to_dict(orient="index")
  620        )
  621
  622        stats["Infos"]["Number of variants"] = int(nb_of_variants)
  623
  624        ### Samples
  625
  626        # Init
  627        samples = {}
  628        nb_of_samples = 0
  629
  630        # Check Samples
  631        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
  632            log.debug(f"Check samples...")
  633            for sample in self.get_header_sample_list():
  634                sql_query_samples = f"""
  635                    SELECT  '{sample}' as sample,
  636                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
  637                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
  638                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
  639                    FROM {table_variants_from}
  640                    WHERE (
  641                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
  642                        AND
  643                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
  644                      )
  645                    GROUP BY genotype
  646                    """
  647                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
  648                sample_genotype_count = sql_query_genotype_df["count"].sum()
  649                if len(sql_query_genotype_df):
  650                    nb_of_samples += 1
  651                    samples[f"{sample} - {sample_genotype_count} variants"] = (
  652                        sql_query_genotype_df.to_dict(orient="index")
  653                    )
  654
  655            stats["Samples"] = samples
  656            stats["Infos"]["Number of samples"] = nb_of_samples
  657
  658        # #
  659        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
  660        #     stats["Infos"]["Number of samples"] = nb_of_samples
  661        # elif nb_of_samples:
  662        #     stats["Infos"]["Number of samples"] = "not a VCF format"
  663
  664        ### INFO and FORMAT fields
  665        header_types_df = {}
  666        header_types_list = {
  667            "List of INFO fields": header_infos,
  668            "List of FORMAT fields": header_formats,
  669        }
  670        i = 0
  671        for header_type in header_types_list:
  672
  673            header_type_infos = header_types_list.get(header_type)
  674            header_infos_dict = {}
  675
  676            for info in header_type_infos:
  677
  678                i += 1
  679                header_infos_dict[i] = {}
  680
  681                # ID
  682                header_infos_dict[i]["id"] = info
  683
  684                # num
  685                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
  686                if header_type_infos[info].num in genotype_map.keys():
  687                    header_infos_dict[i]["Number"] = genotype_map.get(
  688                        header_type_infos[info].num
  689                    )
  690                else:
  691                    header_infos_dict[i]["Number"] = header_type_infos[info].num
  692
  693                # type
  694                if header_type_infos[info].type:
  695                    header_infos_dict[i]["Type"] = header_type_infos[info].type
  696                else:
  697                    header_infos_dict[i]["Type"] = "."
  698
  699                # desc
  700                if header_type_infos[info].desc != None:
  701                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
  702                else:
  703                    header_infos_dict[i]["Description"] = ""
  704
  705            if len(header_infos_dict):
  706                header_types_df[header_type] = pd.DataFrame.from_dict(
  707                    header_infos_dict, orient="index"
  708                ).to_dict(orient="index")
  709
  710        # Stats
  711        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
  712        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
  713        stats["Header"] = header_types_df
  714
  715        ### QUAL
  716        if "QUAL" in self.get_header_columns():
  717            sql_query_qual = f"""
  718                    SELECT
  719                        avg(CAST(QUAL AS INTEGER)) AS Average,
  720                        min(CAST(QUAL AS INTEGER)) AS Minimum,
  721                        max(CAST(QUAL AS INTEGER)) AS Maximum,
  722                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
  723                        median(CAST(QUAL AS INTEGER)) AS Median,
  724                        variance(CAST(QUAL AS INTEGER)) AS Variance
  725                    FROM {table_variants_from}
  726                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
  727                    """
  728
  729            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
  730            stats["Quality"] = {"Stats": qual}
  731
  732        ### SNV and InDel
  733
  734        sql_query_snv = f"""
  735            
  736            SELECT Type, count FROM (
  737
  738                    SELECT
  739                        'Total' AS Type,
  740                        count(*) AS count
  741                    FROM {table_variants_from}
  742
  743                    UNION
  744
  745                    SELECT
  746                        'MNV' AS Type,
  747                        count(*) AS count
  748                    FROM {table_variants_from}
  749                    WHERE len(REF) > 1 AND len(ALT) > 1
  750                    AND len(REF) = len(ALT)
  751
  752                    UNION
  753
  754                    SELECT
  755                        'InDel' AS Type,
  756                        count(*) AS count
  757                    FROM {table_variants_from}
  758                    WHERE len(REF) > 1 OR len(ALT) > 1
  759                    AND len(REF) != len(ALT)
  760                    
  761                    UNION
  762
  763                    SELECT
  764                        'SNV' AS Type,
  765                        count(*) AS count
  766                    FROM {table_variants_from}
  767                    WHERE len(REF) = 1 AND len(ALT) = 1
  768
  769                )
  770
  771            ORDER BY count DESC
  772
  773                """
  774        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
  775
  776        sql_query_snv_substitution = f"""
  777                SELECT
  778                    concat(REF, '>', ALT) AS 'Substitution',
  779                    count(*) AS count
  780                FROM {table_variants_from}
  781                WHERE len(REF) = 1 AND len(ALT) = 1
  782                GROUP BY REF, ALT
  783                ORDER BY count(*) DESC
  784                """
  785        snv_substitution = (
  786            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
  787        )
  788        stats["Variants"]["Counts"] = snv_indel
  789        stats["Variants"]["Substitutions"] = snv_substitution
  790
  791        return stats
  792
  793    def stats_to_file(self, file: str = None) -> str:
  794        """
  795        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  796        into a JSON object, and writes the JSON object to the specified file.
  797
  798        :param file: The `file` parameter is a string that represents the file path where the JSON data
  799        will be written
  800        :type file: str
  801        :return: the name of the file that was written to.
  802        """
  803
  804        # Get stats
  805        stats = self.get_stats()
  806
  807        # Serializing json
  808        json_object = json.dumps(stats, indent=4)
  809
  810        # Writing to sample.json
  811        with open(file, "w") as outfile:
  812            outfile.write(json_object)
  813
  814        return file
  815
  816    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  817        """
  818        The `print_stats` function generates a markdown file and prints the statistics contained in a
  819        JSON file in a formatted manner.
  820
  821        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  822        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  823        provided, a temporary directory will be created and the stats will be saved in a file named
  824        "stats.md" within that
  825        :type output_file: str
  826        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  827        file where the statistics will be saved. If no value is provided, a temporary directory will be
  828        created and a default file name "stats.json" will be used
  829        :type json_file: str
  830        :return: The function `print_stats` does not return any value. It has a return type annotation
  831        of `None`.
  832        """
  833
  834        # Full path
  835        output_file = full_path(output_file)
  836        json_file = full_path(json_file)
  837
  838        with tempfile.TemporaryDirectory() as tmpdir:
  839
  840            # Files
  841            if not output_file:
  842                output_file = os.path.join(tmpdir, "stats.md")
  843            if not json_file:
  844                json_file = os.path.join(tmpdir, "stats.json")
  845
  846            # Create folders
  847            if not os.path.exists(os.path.dirname(output_file)):
  848                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  849            if not os.path.exists(os.path.dirname(json_file)):
  850                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  851
  852            # Create stats JSON file
  853            stats_file = self.stats_to_file(file=json_file)
  854
  855            # Print stats file
  856            with open(stats_file) as f:
  857                stats = yaml.safe_load(f)
  858
  859            # Output
  860            output_title = []
  861            output_index = []
  862            output = []
  863
  864            # Title
  865            output_title.append("# HOWARD Stats")
  866
  867            # Index
  868            output_index.append("## Index")
  869
  870            # Process sections
  871            for section in stats:
  872                infos = stats.get(section)
  873                section_link = "#" + section.lower().replace(" ", "-")
  874                output.append(f"## {section}")
  875                output_index.append(f"- [{section}]({section_link})")
  876
  877                if len(infos):
  878                    for info in infos:
  879                        try:
  880                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  881                            is_df = True
  882                        except:
  883                            try:
  884                                df = pd.DataFrame.from_dict(
  885                                    json.loads((infos.get(info))), orient="index"
  886                                )
  887                                is_df = True
  888                            except:
  889                                is_df = False
  890                        if is_df:
  891                            output.append(f"### {info}")
  892                            info_link = "#" + info.lower().replace(" ", "-")
  893                            output_index.append(f"   - [{info}]({info_link})")
  894                            output.append(f"{df.to_markdown(index=False)}")
  895                        else:
  896                            output.append(f"- {info}: {infos.get(info)}")
  897                else:
  898                    output.append(f"NA")
  899
  900            # Write stats in markdown file
  901            with open(output_file, "w") as fp:
  902                for item in output_title:
  903                    fp.write("%s\n" % item)
  904                for item in output_index:
  905                    fp.write("%s\n" % item)
  906                for item in output:
  907                    fp.write("%s\n" % item)
  908
  909            # Output stats in markdown
  910            print("")
  911            print("\n\n".join(output_title))
  912            print("")
  913            print("\n\n".join(output))
  914            print("")
  915
  916        return None
  917
  918    def get_input(self) -> str:
  919        """
  920        It returns the value of the input variable.
  921        :return: The input is being returned.
  922        """
  923        return self.input
  924
  925    def get_input_format(self, input_file: str = None) -> str:
  926        """
  927        This function returns the format of the input variable, either from the provided input file or
  928        by prompting for input.
  929
  930        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  931        represents the file path of the input file. If no `input_file` is provided when calling the
  932        method, it will default to `None`
  933        :type input_file: str
  934        :return: The format of the input variable is being returned.
  935        """
  936
  937        if not input_file:
  938            input_file = self.get_input()
  939        input_format = get_file_format(input_file)
  940        return input_format
  941
  942    def get_input_compressed(self, input_file: str = None) -> str:
  943        """
  944        The function `get_input_compressed` returns the format of the input variable after compressing
  945        it.
  946
  947        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  948        that represents the file path of the input file. If no `input_file` is provided when calling the
  949        method, it will default to `None` and the method will then call `self.get_input()` to
  950        :type input_file: str
  951        :return: The function `get_input_compressed` returns the compressed format of the input
  952        variable.
  953        """
  954
  955        if not input_file:
  956            input_file = self.get_input()
  957        input_compressed = get_file_compressed(input_file)
  958        return input_compressed
  959
  960    def get_output(self) -> str:
  961        """
  962        It returns the output of the neuron.
  963        :return: The output of the neural network.
  964        """
  965
  966        return self.output
  967
  968    def get_output_format(self, output_file: str = None) -> str:
  969        """
  970        The function `get_output_format` returns the format of the input variable or the output file if
  971        provided.
  972
  973        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  974        that represents the file path of the output file. If no `output_file` is provided when calling
  975        the method, it will default to the output obtained from the `get_output` method of the class
  976        instance. The
  977        :type output_file: str
  978        :return: The format of the input variable is being returned.
  979        """
  980
  981        if not output_file:
  982            output_file = self.get_output()
  983        output_format = get_file_format(output_file)
  984
  985        return output_format
  986
  987    def get_config(self) -> dict:
  988        """
  989        It returns the config
  990        :return: The config variable is being returned.
  991        """
  992        return self.config
  993
  994    def get_param(self) -> dict:
  995        """
  996        It returns the param
  997        :return: The param variable is being returned.
  998        """
  999        return self.param
 1000
 1001    def get_connexion_db(self) -> str:
 1002        """
 1003        It returns the connexion_db attribute of the object
 1004        :return: The connexion_db is being returned.
 1005        """
 1006        return self.connexion_db
 1007
 1008    def get_prefix(self) -> str:
 1009        """
 1010        It returns the prefix of the object.
 1011        :return: The prefix is being returned.
 1012        """
 1013        return self.prefix
 1014
 1015    def get_table_variants(self, clause: str = "select") -> str:
 1016        """
 1017        This function returns the table_variants attribute of the object
 1018
 1019        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1020        defaults to select (optional)
 1021        :return: The table_variants attribute of the object.
 1022        """
 1023
 1024        # Access
 1025        access = self.get_config().get("access", None)
 1026
 1027        # Clauses "select", "where", "update"
 1028        if clause in ["select", "where", "update"]:
 1029            table_variants = self.table_variants
 1030        # Clause "from"
 1031        elif clause in ["from"]:
 1032            # For Read Only
 1033            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1034                input_file = self.get_input()
 1035                table_variants = f"'{input_file}' as variants"
 1036            # For Read Write
 1037            else:
 1038                table_variants = f"{self.table_variants} as variants"
 1039        else:
 1040            table_variants = self.table_variants
 1041        return table_variants
 1042
 1043    def get_tmp_dir(self) -> str:
 1044        """
 1045        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1046        parameters or a default path.
 1047        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1048        configuration, parameters, and a default value of "/tmp".
 1049        """
 1050
 1051        return get_tmp(
 1052            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1053        )
 1054
 1055    def get_connexion_type(self) -> str:
 1056        """
 1057        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1058
 1059        :return: The connexion type is being returned.
 1060        """
 1061        return self.get_config().get("connexion_type", "memory")
 1062
 1063    def get_connexion(self):
 1064        """
 1065        It returns the connection object
 1066
 1067        :return: The connection object.
 1068        """
 1069        return self.conn
 1070
 1071    def close_connexion(self) -> None:
 1072        """
 1073        This function closes the connection to the database.
 1074        :return: The connection is being closed.
 1075        """
 1076        return self.conn.close()
 1077
 1078    def get_header(self, type: str = "vcf"):
 1079        """
 1080        This function returns the header of the VCF file as a list of strings
 1081
 1082        :param type: the type of header you want to get, defaults to vcf (optional)
 1083        :return: The header of the vcf file.
 1084        """
 1085
 1086        if self.header_vcf:
 1087            if type == "vcf":
 1088                return self.header_vcf
 1089            elif type == "list":
 1090                return self.header_list
 1091        else:
 1092            if type == "vcf":
 1093                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1094                return header
 1095            elif type == "list":
 1096                return vcf_required
 1097
 1098    def get_header_infos_list(self) -> list:
 1099        """
 1100        This function retrieves a list of information fields from the header.
 1101        :return: A list of information fields from the header.
 1102        """
 1103
 1104        # Init
 1105        infos_list = []
 1106
 1107        for field in self.get_header().infos:
 1108            infos_list.append(field)
 1109
 1110        return infos_list
 1111
 1112    def get_header_length(self, file: str = None) -> int:
 1113        """
 1114        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1115        line.
 1116
 1117        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1118        header file. If this argument is provided, the function will read the header from the specified
 1119        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1120        :type file: str
 1121        :return: the length of the header list, excluding the #CHROM line.
 1122        """
 1123
 1124        if file:
 1125            return len(self.read_vcf_header_file(file=file)) - 1
 1126        elif self.get_header(type="list"):
 1127            return len(self.get_header(type="list")) - 1
 1128        else:
 1129            return 0
 1130
 1131    def get_header_columns(self) -> str:
 1132        """
 1133        This function returns the header list of a VCF
 1134
 1135        :return: The length of the header list.
 1136        """
 1137        if self.get_header():
 1138            return self.get_header(type="list")[-1]
 1139        else:
 1140            return ""
 1141
 1142    def get_header_columns_as_list(self) -> list:
 1143        """
 1144        This function returns the header list of a VCF
 1145
 1146        :return: The length of the header list.
 1147        """
 1148        if self.get_header():
 1149            return self.get_header_columns().strip().split("\t")
 1150        else:
 1151            return []
 1152
 1153    def get_header_columns_as_sql(self) -> str:
 1154        """
 1155        This function retruns header length (without #CHROM line)
 1156
 1157        :return: The length of the header list.
 1158        """
 1159        sql_column_list = []
 1160        for col in self.get_header_columns_as_list():
 1161            sql_column_list.append(f'"{col}"')
 1162        return ",".join(sql_column_list)
 1163
 1164    def get_header_sample_list(
 1165        self, check: bool = False, samples: list = None, samples_force: bool = False
 1166    ) -> list:
 1167        """
 1168        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1169        checking and filtering based on input parameters.
 1170
 1171        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1172        parameter that determines whether to check if the samples in the list are properly defined as
 1173        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1174        list is defined as a, defaults to False
 1175        :type check: bool (optional)
 1176        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1177        allows you to specify a subset of samples from the header. If you provide a list of sample
 1178        names, the function will check if each sample is defined in the header. If a sample is not found
 1179        in the
 1180        :type samples: list
 1181        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1182        a boolean parameter that determines whether to force the function to return the sample list
 1183        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1184        function will return the sample list without performing, defaults to False
 1185        :type samples_force: bool (optional)
 1186        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1187        parameters and conditions specified in the function.
 1188        """
 1189
 1190        # Init
 1191        samples_list = []
 1192
 1193        if samples is None:
 1194            samples_list = self.header_vcf.samples
 1195        else:
 1196            samples_checked = []
 1197            for sample in samples:
 1198                if sample in self.header_vcf.samples:
 1199                    samples_checked.append(sample)
 1200                else:
 1201                    log.warning(f"Sample '{sample}' not defined in header")
 1202            samples_list = samples_checked
 1203
 1204            # Force sample list without checking if is_genotype_column
 1205            if samples_force:
 1206                log.warning(f"Samples {samples_list} not checked if genotypes")
 1207                return samples_list
 1208
 1209        if check:
 1210            samples_checked = []
 1211            for sample in samples_list:
 1212                if self.is_genotype_column(column=sample):
 1213                    samples_checked.append(sample)
 1214                else:
 1215                    log.warning(
 1216                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1217                    )
 1218            samples_list = samples_checked
 1219
 1220        # Return samples list
 1221        return samples_list
 1222
 1223    def is_genotype_column(self, column: str = None) -> bool:
 1224        """
 1225        This function checks if a given column is a genotype column in a database.
 1226
 1227        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1228        represents the column name in a database table. This method checks if the specified column is a
 1229        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1230        method of
 1231        :type column: str
 1232        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1233        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1234        column name and returns the result. If the `column` parameter is None, it returns False.
 1235        """
 1236
 1237        if column is not None:
 1238            return Database(database=self.get_input()).is_genotype_column(column=column)
 1239        else:
 1240            return False
 1241
 1242    def get_verbose(self) -> bool:
 1243        """
 1244        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1245        exist
 1246
 1247        :return: The value of the key "verbose" in the config dictionary.
 1248        """
 1249        return self.get_config().get("verbose", False)
 1250
 1251    def get_connexion_format(self) -> str:
 1252        """
 1253        It returns the connexion format of the object.
 1254        :return: The connexion_format is being returned.
 1255        """
 1256        connexion_format = self.connexion_format
 1257        if connexion_format not in ["duckdb", "sqlite"]:
 1258            log.error(f"Unknown connexion format {connexion_format}")
 1259            raise ValueError(f"Unknown connexion format {connexion_format}")
 1260        else:
 1261            return connexion_format
 1262
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table, using the insertion method appropriate for the
        current connexion format (duckdb or sqlite).

        :param file: path of the file to load into the table
        :param columns: comma-separated list of column names to insert
        :type columns: str
        :param header_len: number of leading lines to skip before the data
            (e.g. the VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: field separator used in the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
            the "load.chunk" configuration entry, defaults to 1000000
        :type chunksize: int (optional)
        :return: None
        """

        # Config: the configured load chunk size overrides the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): a falsy chunksize (0/None) silently inserts nothing
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves the name "chunk" in the SQL to the local
                    # pandas DataFrame (replacement scan) — do not rename it
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite path appends the DataFrame through pandas
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1316
 1317    def load_data(
 1318        self,
 1319        input_file: str = None,
 1320        drop_variants_table: bool = False,
 1321        sample_size: int = 20480,
 1322    ) -> None:
 1323        """
 1324        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1325        table before loading the data and specify a sample size.
 1326
 1327        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1328        table
 1329        :type input_file: str
 1330        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1331        determines whether the variants table should be dropped before loading the data. If set to
 1332        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1333        not be dropped, defaults to False
 1334        :type drop_variants_table: bool (optional)
 1335        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1336        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1337        20480
 1338        :type sample_size: int (optional)
 1339        """
 1340
 1341        log.info("Loading...")
 1342
 1343        # change input file
 1344        if input_file:
 1345            self.set_input(input_file)
 1346            self.set_header()
 1347
 1348        # drop variants table
 1349        if drop_variants_table:
 1350            self.drop_variants_table()
 1351
 1352        # get table variants
 1353        table_variants = self.get_table_variants()
 1354
 1355        # Access
 1356        access = self.get_config().get("access", None)
 1357        log.debug(f"access: {access}")
 1358
 1359        # Input format and compress
 1360        input_format = self.get_input_format()
 1361        input_compressed = self.get_input_compressed()
 1362        log.debug(f"input_format: {input_format}")
 1363        log.debug(f"input_compressed: {input_compressed}")
 1364
 1365        # input_compressed_format
 1366        if input_compressed:
 1367            input_compressed_format = "gzip"
 1368        else:
 1369            input_compressed_format = "none"
 1370        log.debug(f"input_compressed_format: {input_compressed_format}")
 1371
 1372        # Connexion format
 1373        connexion_format = self.get_connexion_format()
 1374
 1375        # Sample size
 1376        if not sample_size:
 1377            sample_size = -1
 1378        log.debug(f"sample_size: {sample_size}")
 1379
 1380        # Load data
 1381        log.debug(f"Load Data from {input_format}")
 1382
 1383        # DuckDB connexion
 1384        if connexion_format in ["duckdb"]:
 1385
 1386            # Database already exists
 1387            if self.input_format in ["db", "duckdb"]:
 1388
 1389                if connexion_format in ["duckdb"]:
 1390                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1391                else:
 1392                    log.error(
 1393                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1394                    )
 1395                    raise ValueError(
 1396                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1397                    )
 1398
 1399            # Load from existing database format
 1400            else:
 1401
 1402                try:
 1403                    # Create Table or View
 1404                    database = Database(database=self.input)
 1405                    sql_from = database.get_sql_from(sample_size=sample_size)
 1406
 1407                    if access in ["RO"]:
 1408                        sql_load = (
 1409                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1410                        )
 1411                    else:
 1412                        sql_load = (
 1413                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1414                        )
 1415                    self.conn.execute(sql_load)
 1416
 1417                except:
 1418                    # Format not available
 1419                    log.error(f"Input file format '{self.input_format}' not available")
 1420                    raise ValueError(
 1421                        f"Input file format '{self.input_format}' not available"
 1422                    )
 1423
 1424        # SQLite connexion
 1425        elif connexion_format in ["sqlite"] and input_format in [
 1426            "vcf",
 1427            "tsv",
 1428            "csv",
 1429            "psv",
 1430        ]:
 1431
 1432            # Main structure
 1433            structure = {
 1434                "#CHROM": "VARCHAR",
 1435                "POS": "INTEGER",
 1436                "ID": "VARCHAR",
 1437                "REF": "VARCHAR",
 1438                "ALT": "VARCHAR",
 1439                "QUAL": "VARCHAR",
 1440                "FILTER": "VARCHAR",
 1441                "INFO": "VARCHAR",
 1442            }
 1443
 1444            # Strcuture with samples
 1445            structure_complete = structure
 1446            if self.get_header_sample_list():
 1447                structure["FORMAT"] = "VARCHAR"
 1448                for sample in self.get_header_sample_list():
 1449                    structure_complete[sample] = "VARCHAR"
 1450
 1451            # Columns list for create and insert
 1452            sql_create_table_columns = []
 1453            sql_create_table_columns_list = []
 1454            for column in structure_complete:
 1455                column_type = structure_complete[column]
 1456                sql_create_table_columns.append(
 1457                    f'"{column}" {column_type} default NULL'
 1458                )
 1459                sql_create_table_columns_list.append(f'"{column}"')
 1460
 1461            # Create database
 1462            log.debug(f"Create Table {table_variants}")
 1463            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1464            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1465            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1466            self.conn.execute(sql_create_table)
 1467
 1468            # chunksize define length of file chunk load file
 1469            chunksize = 100000
 1470
 1471            # delimiter
 1472            delimiter = file_format_delimiters.get(input_format, "\t")
 1473
 1474            # Load the input file
 1475            with open(self.input, "rt") as input_file:
 1476
 1477                # Use the appropriate file handler based on the input format
 1478                if input_compressed:
 1479                    input_file = bgzf.open(self.input, "rt")
 1480                if input_format in ["vcf"]:
 1481                    header_len = self.get_header_length()
 1482                else:
 1483                    header_len = 0
 1484
 1485                # Insert the file contents into a table
 1486                self.insert_file_to_table(
 1487                    input_file,
 1488                    columns=sql_create_table_columns_list_sql,
 1489                    header_len=header_len,
 1490                    sep=delimiter,
 1491                    chunksize=chunksize,
 1492                )
 1493
 1494        else:
 1495            log.error(
 1496                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1497            )
 1498            raise ValueError(
 1499                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1500            )
 1501
 1502        # Explode INFOS fields into table fields
 1503        if self.get_explode_infos():
 1504            self.explode_infos(
 1505                prefix=self.get_explode_infos_prefix(),
 1506                fields=self.get_explode_infos_fields(),
 1507                force=True,
 1508            )
 1509
 1510        # Create index after insertion
 1511        self.create_indexes()
 1512
 1513    def get_explode_infos(self) -> bool:
 1514        """
 1515        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1516        to False if it is not set.
 1517        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1518        value. If the parameter is not present, it will return False.
 1519        """
 1520
 1521        return self.get_param().get("explode", {}).get("explode_infos", False)
 1522
 1523    def get_explode_infos_fields(
 1524        self,
 1525        explode_infos_fields: str = None,
 1526        remove_fields_not_in_header: bool = False,
 1527    ) -> list:
 1528        """
 1529        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1530        the input parameter `explode_infos_fields`.
 1531
 1532        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1533        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1534        comma-separated list of field names to explode
 1535        :type explode_infos_fields: str
 1536        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1537        flag that determines whether to remove fields that are not present in the header. If it is set
 1538        to `True`, any field that is not in the header will be excluded from the list of exploded
 1539        information fields. If it is set to `, defaults to False
 1540        :type remove_fields_not_in_header: bool (optional)
 1541        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1542        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1543        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1544        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1545        splitting the string by commas.
 1546        """
 1547
 1548        # If no fields, get it in param
 1549        if not explode_infos_fields:
 1550            explode_infos_fields = (
 1551                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1552            )
 1553
 1554        # If no fields, defined as all fields in header using keyword
 1555        if not explode_infos_fields:
 1556            explode_infos_fields = "*"
 1557
 1558        # If fields list not empty
 1559        if explode_infos_fields:
 1560
 1561            # Input fields list
 1562            if isinstance(explode_infos_fields, str):
 1563                fields_input = explode_infos_fields.split(",")
 1564            elif isinstance(explode_infos_fields, list):
 1565                fields_input = explode_infos_fields
 1566            else:
 1567                fields_input = []
 1568
 1569            # Fields list without * keyword
 1570            fields_without_all = fields_input.copy()
 1571            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1572                fields_without_all.remove("*")
 1573
 1574            # Fields in header
 1575            fields_in_header = sorted(list(set(self.get_header().infos)))
 1576
 1577            # Construct list of fields
 1578            fields_output = []
 1579            for field in fields_input:
 1580
 1581                # Strip field
 1582                field = field.strip()
 1583
 1584                # format keyword * in regex
 1585                if field.upper() in ["*"]:
 1586                    field = ".*"
 1587
 1588                # Find all fields with pattern
 1589                r = re.compile(field)
 1590                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1591
 1592                # Remove fields input from search
 1593                if field in fields_search:
 1594                    fields_search = [field]
 1595                elif fields_search != [field]:
 1596                    fields_search = sorted(
 1597                        list(set(fields_search).difference(fields_input))
 1598                    )
 1599
 1600                # If field is not in header (avoid not well formatted header)
 1601                if not fields_search and not remove_fields_not_in_header:
 1602                    fields_search = [field]
 1603
 1604                # Add found fields
 1605                for new_field in fields_search:
 1606                    # Add field, if not already exists, and if it is in header (if asked)
 1607                    if (
 1608                        new_field not in fields_output
 1609                        and (
 1610                            not remove_fields_not_in_header
 1611                            or new_field in fields_in_header
 1612                        )
 1613                        and new_field not in [".*"]
 1614                    ):
 1615                        fields_output.append(new_field)
 1616
 1617            return fields_output
 1618
 1619        else:
 1620
 1621            return []
 1622
 1623    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1624        """
 1625        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1626        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1627        not provided.
 1628
 1629        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1630        prefix to be used for exploding or expanding information
 1631        :type explode_infos_prefix: str
 1632        :return: the value of the variable `explode_infos_prefix`.
 1633        """
 1634
 1635        if not explode_infos_prefix:
 1636            explode_infos_prefix = (
 1637                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1638            )
 1639
 1640        return explode_infos_prefix
 1641
 1642    def add_column(
 1643        self,
 1644        table_name,
 1645        column_name,
 1646        column_type,
 1647        default_value=None,
 1648        drop: bool = False,
 1649    ) -> dict:
 1650        """
 1651        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1652        doesn't already exist.
 1653
 1654        :param table_name: The name of the table to which you want to add a column
 1655        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1656        to the table
 1657        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1658        want to add to the table. It should be a string that represents the desired data type, such as
 1659        "INTEGER", "TEXT", "REAL", etc
 1660        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1661        default value for the newly added column. If a default value is provided, it will be assigned to
 1662        the column for any existing rows that do not have a value for that column
 1663        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1664        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1665        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1666        to False
 1667        :type drop: bool (optional)
 1668        :return: a boolean value indicating whether the column was successfully added to the table.
 1669        """
 1670
 1671        # added
 1672        added = False
 1673        dropped = False
 1674
 1675        # Check if the column already exists in the table
 1676        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1677        columns = self.get_query_to_df(query).columns.tolist()
 1678        if column_name.upper() in [c.upper() for c in columns]:
 1679            log.debug(
 1680                f"The {column_name} column already exists in the {table_name} table"
 1681            )
 1682            if drop:
 1683                self.drop_column(table_name=table_name, column_name=column_name)
 1684                dropped = True
 1685            else:
 1686                return None
 1687        else:
 1688            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1689
 1690        # Add column in table
 1691        add_column_query = (
 1692            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1693        )
 1694        if default_value is not None:
 1695            add_column_query += f" DEFAULT {default_value}"
 1696        self.execute_query(add_column_query)
 1697        added = not dropped
 1698        log.debug(
 1699            f"The {column_name} column was successfully added to the {table_name} table"
 1700        )
 1701
 1702        if added:
 1703            added_column = {
 1704                "table_name": table_name,
 1705                "column_name": column_name,
 1706                "column_type": column_type,
 1707                "default_value": default_value,
 1708            }
 1709        else:
 1710            added_column = None
 1711
 1712        return added_column
 1713
 1714    def drop_column(
 1715        self, column: dict = None, table_name: str = None, column_name: str = None
 1716    ) -> bool:
 1717        """
 1718        The `drop_column` function drops a specified column from a given table in a database and returns
 1719        True if the column was successfully dropped, and False if the column does not exist in the
 1720        table.
 1721
 1722        :param column: The `column` parameter is a dictionary that contains information about the column
 1723        you want to drop. It has two keys:
 1724        :type column: dict
 1725        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1726        drop a column
 1727        :type table_name: str
 1728        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1729        from the table
 1730        :type column_name: str
 1731        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1732        and False if the column does not exist in the table.
 1733        """
 1734
 1735        # Find column infos
 1736        if column:
 1737            if isinstance(column, dict):
 1738                table_name = column.get("table_name", None)
 1739                column_name = column.get("column_name", None)
 1740            elif isinstance(column, str):
 1741                table_name = self.get_table_variants()
 1742                column_name = column
 1743            else:
 1744                table_name = None
 1745                column_name = None
 1746
 1747        if not table_name and not column_name:
 1748            return False
 1749
 1750        # Removed
 1751        removed = False
 1752
 1753        # Check if the column already exists in the table
 1754        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1755        columns = self.get_query_to_df(query).columns.tolist()
 1756        if column_name in columns:
 1757            log.debug(f"The {column_name} column exists in the {table_name} table")
 1758        else:
 1759            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1760            return False
 1761
 1762        # Add column in table # ALTER TABLE integers DROP k
 1763        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1764        self.execute_query(add_column_query)
 1765        removed = True
 1766        log.debug(
 1767            f"The {column_name} column was successfully dropped to the {table_name} table"
 1768        )
 1769
 1770        return removed
 1771
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function explodes VCF INFO fields into individual table
        columns, returning a list of added columns.

        :param prefix: Prefix used for the exploded column names. When not a string
        (e.g. None or True), the configured prefix from `get_explode_infos_prefix()`
        is used, or "INFO/" as a last resort
        :type prefix: str
        :param create_index: Whether to (re)create indexes once the INFO fields have
        been exploded, defaults to False
        :type create_index: bool (optional)
        :param fields: List of INFO fields (or patterns) to explode; resolved through
        `get_explode_infos_fields()`, so "*" and regex patterns are supported
        :type fields: list
        :param force: Whether to drop and re-create a column that already exists in
        the table, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: Whether to update all exploded columns in
        a single UPDATE statement instead of one UPDATE per column, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: Name of the table where the exploded columns are added; defaults
        to the variants table
        :type table: str
        :return: The list of added columns (dicts as produced by `add_column`).
        """

        # Drop indexes before altering and mass-updating the table
        self.drop_indexes()

        # connexion format (duckdb or sqlite: the UPDATE expression differs)
        connexion_format = self.get_connexion_format()

        # Access mode: nothing can be exploded on a read-only ("RO") database
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: default to none when unavailable)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # One SET expression per exploded field
            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" or regex) into actual field names
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields known from the header, the request, or extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/number from the header; unknown fields default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL type; non-scalar fields (num != 1) stay VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # NOTE: add_column returns None for a dropped+re-created column,
                    # so `force` must also trigger the value update below
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "<info>=<value>" from the INFO column
                        if connexion_format in ["duckdb"]:
                            # duckdb: regex extraction; '' and '.' are mapped to NULL
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # sqlite: no regex function available, so the extraction is
                            # emulated with nested instr/substr calls
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes — presumably to keep individual UPDATE statements
                # smaller; falls back to a single table-wide update on failure
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when there is more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # Single UPDATE covering every exploded field at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
 1988
 1989    def create_indexes(self) -> None:
 1990        """
 1991        Create indexes on the table after insertion
 1992        """
 1993
 1994        # Access
 1995        access = self.get_config().get("access", None)
 1996
 1997        # get table variants
 1998        table_variants = self.get_table_variants("FROM")
 1999
 2000        if self.get_indexing() and access not in ["RO"]:
 2001            # Create index
 2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2003            self.conn.execute(sql_create_table_index)
 2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2005            self.conn.execute(sql_create_table_index)
 2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2007            self.conn.execute(sql_create_table_index)
 2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2009            self.conn.execute(sql_create_table_index)
 2010            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2011            self.conn.execute(sql_create_table_index)
 2012            for field in self.index_additionnal_fields:
 2013                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2014                self.conn.execute(sql_create_table_index)
 2015
 2016    def drop_indexes(self) -> None:
 2017        """
 2018        Create indexes on the table after insertion
 2019        """
 2020
 2021        # Access
 2022        access = self.get_config().get("access", None)
 2023
 2024        # get table variants
 2025        table_variants = self.get_table_variants("FROM")
 2026
 2027        # Get database format
 2028        connexion_format = self.get_connexion_format()
 2029
 2030        if access not in ["RO"]:
 2031            if connexion_format in ["duckdb"]:
 2032                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2033            elif connexion_format in ["sqlite"]:
 2034                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2035
 2036            list_indexes = self.conn.execute(sql_list_indexes)
 2037            index_names = [row[0] for row in list_indexes.fetchall()]
 2038            for index in index_names:
 2039                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2040                self.conn.execute(sql_drop_table_index)
 2041
 2042    def read_vcf_header(self, f) -> list:
 2043        """
 2044        It reads the header of a VCF file and returns a list of the header lines
 2045
 2046        :param f: the file object
 2047        :return: The header lines of the VCF file.
 2048        """
 2049
 2050        header_list = []
 2051        for line in f:
 2052            header_list.append(line)
 2053            if line.startswith("#CHROM"):
 2054                break
 2055        return header_list
 2056
 2057    def read_vcf_header_file(self, file: str = None) -> list:
 2058        """
 2059        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2060        uncompressed files.
 2061
 2062        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2063        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2064        default to `None`
 2065        :type file: str
 2066        :return: The function `read_vcf_header_file` returns a list.
 2067        """
 2068
 2069        if self.get_input_compressed(input_file=file):
 2070            with bgzf.open(file, "rt") as f:
 2071                return self.read_vcf_header(f=f)
 2072        else:
 2073            with open(file, "rt") as f:
 2074                return self.read_vcf_header(f=f)
 2075
 2076    def execute_query(self, query: str):
 2077        """
 2078        It takes a query as an argument, executes it, and returns the results
 2079
 2080        :param query: The query to be executed
 2081        :return: The result of the query is being returned.
 2082        """
 2083        if query:
 2084            return self.conn.execute(query)  # .fetchall()
 2085        else:
 2086            return None
 2087
 2088    def export_output(
 2089        self,
 2090        output_file: str | None = None,
 2091        output_header: str | None = None,
 2092        export_header: bool = True,
 2093        query: str | None = None,
 2094        parquet_partitions: list | None = None,
 2095        chunk_size: int | None = None,
 2096        threads: int | None = None,
 2097        sort: bool = False,
 2098        index: bool = False,
 2099        order_by: str | None = None,
 2100        fields_to_rename: dict | None = None
 2101    ) -> bool:
 2102        """
 2103        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2104        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2105        partitioning.
 2106        
 2107        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2108        output file where the exported data will be saved
 2109        :type output_file: str | None
 2110        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2111        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2112        header will be exported to a file with the same name as the `output_file` parameter, but with
 2113        the extension "
 2114        :type output_header: str | None
 2115        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2116        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2117        True, the header will be exported to a file. If `export_header` is False, the header will not
 2118        be, defaults to True
 2119        :type export_header: bool (optional)
 2120        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2121        that can be used to filter and select specific data from the VCF file before exporting it. If
 2122        provided, only the data that matches the query will be exported. This allows you to customize
 2123        the exported data based on
 2124        :type query: str | None
 2125        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2126        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2127        organize data in a hierarchical directory structure based on the values of one or more columns.
 2128        This can improve query performance when working with large datasets
 2129        :type parquet_partitions: list | None
 2130        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2131        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2132        multiple files. It helps in optimizing the export process by breaking down the data into
 2133        manageable chunks for processing and storage
 2134        :type chunk_size: int | None
 2135        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2136        threads to be used during the export process. It determines the level of parallelism and can
 2137        improve the performance of the export operation. If this parameter is not provided, the function
 2138        will use the default number of threads
 2139        :type threads: int | None
 2140        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2141        determines whether the output file should be sorted based on genomic coordinates of the
 2142        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2143        `False`,, defaults to False
 2144        :type sort: bool (optional)
 2145        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2146        determines whether an index should be created on the output file. If `index` is set to `True`,
 2147        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2148        :type index: bool (optional)
 2149        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2150        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2151        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2152        output file should be
 2153        :type order_by: str | None
 2154        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2155        mapping of field names to be renamed during the export process. This parameter allows you to
 2156        customize the output field names before exporting the data. Each key-value pair in the
 2157        dictionary represents the original field name as the key and the new field name
 2158        :type fields_to_rename: dict | None
 2159        :return: The `export_output` function returns a boolean value. It checks if the output file
 2160        exists and returns True if it does, or None if it doesn't.
 2161        """
 2162
 2163        # Log
 2164        log.info("Exporting...")
 2165
 2166        # Full path
 2167        output_file = full_path(output_file)
 2168        output_header = full_path(output_header)
 2169
 2170        # Config
 2171        config = self.get_config()
 2172
 2173        # Param
 2174        param = self.get_param()
 2175
 2176        # Tmp files to remove
 2177        tmp_to_remove = []
 2178
 2179        # If no output, get it
 2180        if not output_file:
 2181            output_file = self.get_output()
 2182
 2183        # If not threads
 2184        if not threads:
 2185            threads = self.get_threads()
 2186
 2187        # Rename fields
 2188        if not fields_to_rename:
 2189            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2190        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2191
 2192        # Auto header name with extension
 2193        if export_header or output_header:
 2194            if not output_header:
 2195                output_header = f"{output_file}.hdr"
 2196            # Export header
 2197            self.export_header(output_file=output_file)
 2198
 2199        # Switch off export header if VCF output
 2200        output_file_type = get_file_format(output_file)
 2201        if output_file_type in ["vcf"]:
 2202            export_header = False
 2203            tmp_to_remove.append(output_header)
 2204
 2205        # Chunk size
 2206        if not chunk_size:
 2207            chunk_size = config.get("chunk_size", None)
 2208
 2209        # Parquet partition
 2210        if not parquet_partitions:
 2211            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2212        if parquet_partitions and isinstance(parquet_partitions, str):
 2213            parquet_partitions = parquet_partitions.split(",")
 2214
 2215        # Order by
 2216        if not order_by:
 2217            order_by = param.get("export", {}).get("order_by", "")
 2218
 2219        # Header in output
 2220        header_in_output = param.get("export", {}).get("include_header", False)
 2221
 2222        # Database
 2223        database_source = self.get_connexion()
 2224
 2225        # Connexion format
 2226        connexion_format = self.get_connexion_format()
 2227
 2228        # Explode infos
 2229        if self.get_explode_infos():
 2230            self.explode_infos(
 2231                prefix=self.get_explode_infos_prefix(),
 2232                fields=self.get_explode_infos_fields(),
 2233                force=False,
 2234            )
 2235
 2236        # if connexion_format in ["sqlite"] or query:
 2237        if connexion_format in ["sqlite"]:
 2238
 2239            # Export in Parquet
 2240            random_tmp = "".join(
 2241                random.choice(string.ascii_lowercase) for i in range(10)
 2242            )
 2243            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2244            tmp_to_remove.append(database_source)
 2245
 2246            # Table Variants
 2247            table_variants = self.get_table_variants()
 2248
 2249            # Create export query
 2250            sql_query_export_subquery = f"""
 2251                SELECT * FROM {table_variants}
 2252                """
 2253
 2254            # Write source file
 2255            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2256
 2257        # Create database
 2258        database = Database(
 2259            database=database_source,
 2260            table="variants",
 2261            header_file=output_header,
 2262            conn_config=self.get_connexion_config(),
 2263        )
 2264
 2265        # Existing colomns header
 2266        existing_columns_header = database.get_header_columns_from_database(query=query)
 2267
 2268        # Sample list
 2269        if output_file_type in ["vcf"]:
 2270            get_samples = self.get_samples()
 2271            get_samples_check = self.get_samples_check()
 2272            samples_force = get_samples is not None
 2273            sample_list = self.get_header_sample_list(
 2274                check=get_samples_check,
 2275                samples=get_samples,
 2276                samples_force=samples_force,
 2277            )
 2278        else:
 2279            sample_list = None
 2280
 2281        # Export file
 2282        database.export(
 2283            output_database=output_file,
 2284            output_header=output_header,
 2285            existing_columns_header=existing_columns_header,
 2286            parquet_partitions=parquet_partitions,
 2287            chunk_size=chunk_size,
 2288            threads=threads,
 2289            sort=sort,
 2290            index=index,
 2291            header_in_output=header_in_output,
 2292            order_by=order_by,
 2293            query=query,
 2294            export_header=export_header,
 2295            sample_list=sample_list,
 2296        )
 2297
 2298        # Remove
 2299        remove_if_exists(tmp_to_remove)
 2300
 2301        return (os.path.exists(output_file) or None) and (
 2302            os.path.exists(output_file) or None
 2303        )
 2304
 2305    def get_extra_infos(self, table: str = None) -> list:
 2306        """
 2307        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2308        in the header.
 2309
 2310        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2311        name of the table from which you want to retrieve the extra columns that are not present in the
 2312        header. If the `table` parameter is not provided when calling the function, it will default to
 2313        using the variants
 2314        :type table: str
 2315        :return: A list of columns that are in the specified table but not in the header of the table.
 2316        """
 2317
 2318        header_columns = []
 2319
 2320        if not table:
 2321            table = self.get_table_variants(clause="from")
 2322            header_columns = self.get_header_columns()
 2323
 2324        # Check all columns in the database
 2325        query = f""" SELECT * FROM {table} LIMIT 1 """
 2326        log.debug(f"query {query}")
 2327        table_columns = self.get_query_to_df(query).columns.tolist()
 2328        extra_columns = []
 2329
 2330        # Construct extra infos (not in header)
 2331        for column in table_columns:
 2332            if column not in header_columns:
 2333                extra_columns.append(column)
 2334
 2335        return extra_columns
 2336
 2337    def get_extra_infos_sql(self, table: str = None) -> str:
 2338        """
 2339        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2340        by double quotes
 2341
 2342        :param table: The name of the table to get the extra infos from. If None, the default table is
 2343        used
 2344        :type table: str
 2345        :return: A string of the extra infos
 2346        """
 2347
 2348        return ", ".join(
 2349            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2350        )
 2351
 2352    def export_header(
 2353        self,
 2354        header_name: str = None,
 2355        output_file: str = None,
 2356        output_file_ext: str = ".hdr",
 2357        clean_header: bool = True,
 2358        remove_chrom_line: bool = False,
 2359    ) -> str:
 2360        """
 2361        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2362        specified options, and writes it to a new file.
 2363
 2364        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2365        this parameter is not specified, the header will be written to the output file
 2366        :type header_name: str
 2367        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2368        specify the name of the output file where the header will be written. If this parameter is not
 2369        provided, the header will be written to a temporary file
 2370        :type output_file: str
 2371        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2372        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2373        if not specified by the user. This extension will be appended to the `output_file` name to
 2374        create the final, defaults to .hdr
 2375        :type output_file_ext: str (optional)
 2376        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2377        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2378        `True`, the function will clean the header by modifying certain lines based on a specific
 2379        pattern. If `clean_header`, defaults to True
 2380        :type clean_header: bool (optional)
 2381        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2382        boolean flag that determines whether the #CHROM line should be removed from the header before
 2383        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2384        defaults to False
 2385        :type remove_chrom_line: bool (optional)
 2386        :return: The function `export_header` returns the name of the temporary header file that is
 2387        created.
 2388        """
 2389
 2390        if not header_name and not output_file:
 2391            output_file = self.get_output()
 2392
 2393        if self.get_header():
 2394
 2395            # Get header object
 2396            header_obj = self.get_header()
 2397
 2398            # Create database
 2399            db_for_header = Database(database=self.get_input())
 2400
 2401            # Get real columns in the file
 2402            db_header_columns = db_for_header.get_columns()
 2403
 2404            with tempfile.TemporaryDirectory() as tmpdir:
 2405
 2406                # Write header file
 2407                header_file_tmp = os.path.join(tmpdir, "header")
 2408                f = open(header_file_tmp, "w")
 2409                vcf.Writer(f, header_obj)
 2410                f.close()
 2411
 2412                # Replace #CHROM line with rel columns
 2413                header_list = db_for_header.read_header_file(
 2414                    header_file=header_file_tmp
 2415                )
 2416                header_list[-1] = "\t".join(db_header_columns)
 2417
 2418                # Remove CHROM line
 2419                if remove_chrom_line:
 2420                    header_list.pop()
 2421
 2422                # Clean header
 2423                if clean_header:
 2424                    header_list_clean = []
 2425                    for head in header_list:
 2426                        # Clean head for malformed header
 2427                        head_clean = head
 2428                        head_clean = re.subn(
 2429                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2430                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2431                            head_clean,
 2432                            2,
 2433                        )[0]
 2434                        # Write header
 2435                        header_list_clean.append(head_clean)
 2436                    header_list = header_list_clean
 2437
 2438            tmp_header_name = output_file + output_file_ext
 2439
 2440            f = open(tmp_header_name, "w")
 2441            for line in header_list:
 2442                f.write(line)
 2443            f.close()
 2444
 2445        return tmp_header_name
 2446
 2447    def export_variant_vcf(
 2448        self,
 2449        vcf_file,
 2450        remove_info: bool = False,
 2451        add_samples: bool = True,
 2452        list_samples: list = [],
 2453        where_clause: str = "",
 2454        index: bool = False,
 2455        threads: int | None = None,
 2456    ) -> bool | None:
 2457        """
 2458        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2459        remove INFO field, add samples, and control compression and indexing.
 2460
 2461        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2462        written to. It is the output file that will contain the filtered VCF data based on the specified
 2463        parameters
 2464        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2465        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2466        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2467        in, defaults to False
 2468        :type remove_info: bool (optional)
 2469        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2470        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2471        If set to False, the samples will be removed. The default value is True, defaults to True
 2472        :type add_samples: bool (optional)
 2473        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2474        in the output VCF file. By default, all samples will be included. If you provide a list of
 2475        samples, only those samples will be included in the output file
 2476        :type list_samples: list
 2477        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2478        determines whether or not to create an index for the output VCF file. If `index` is set to
 2479        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2480        :type index: bool (optional)
 2481        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2482        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2483        will be used during the export process. More threads can potentially speed up the export process
 2484        by utilizing multiple cores of the processor. If
 2485        :type threads: int | None
 2486        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2487        method with various parameters including the output file, query, threads, sort flag, and index
 2488        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2489        specified parameters and configurations provided in the `export_variant_vcf` function.
 2490        """
 2491
 2492        # Config
 2493        config = self.get_config()
 2494
 2495        # Extract VCF
 2496        log.debug("Export VCF...")
 2497
 2498        # Table variants
 2499        table_variants = self.get_table_variants()
 2500
 2501        # Threads
 2502        if not threads:
 2503            threads = self.get_threads()
 2504
 2505        # Info fields
 2506        if remove_info:
 2507            if not isinstance(remove_info, str):
 2508                remove_info = "."
 2509            info_field = f"""'{remove_info}' as INFO"""
 2510        else:
 2511            info_field = "INFO"
 2512
 2513        # Samples fields
 2514        if add_samples:
 2515            if not list_samples:
 2516                list_samples = self.get_header_sample_list()
 2517            if list_samples:
 2518                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2519            else:
 2520                samples_fields = ""
 2521            log.debug(f"samples_fields: {samples_fields}")
 2522        else:
 2523            samples_fields = ""
 2524
 2525        # Where clause
 2526        if where_clause is None:
 2527            where_clause = ""
 2528
 2529        # Variants
 2530        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2531        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2532        log.debug(f"sql_query_select={sql_query_select}")
 2533
 2534        return self.export_output(
 2535            output_file=vcf_file,
 2536            output_header=None,
 2537            export_header=True,
 2538            query=sql_query_select,
 2539            parquet_partitions=None,
 2540            chunk_size=config.get("chunk_size", None),
 2541            threads=threads,
 2542            sort=True,
 2543            index=index,
 2544            order_by=None,
 2545        )
 2546
 2547    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2548        """
 2549        It takes a list of commands and runs them in parallel using the number of threads specified
 2550
 2551        :param commands: A list of commands to run
 2552        :param threads: The number of threads to use, defaults to 1 (optional)
 2553        """
 2554
 2555        run_parallel_commands(commands, threads)
 2556
 2557    def get_threads(self, default: int = 1) -> int:
 2558        """
 2559        This function returns the number of threads to use for a job, with a default value of 1 if not
 2560        specified.
 2561
 2562        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2563        default number of threads to use if no specific value is provided. If no value is provided for
 2564        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2565        used, defaults to 1
 2566        :type default: int (optional)
 2567        :return: the number of threads to use for the current job.
 2568        """
 2569
 2570        # Config
 2571        config = self.get_config()
 2572
 2573        # Param
 2574        param = self.get_param()
 2575
 2576        # Input threads
 2577        input_thread = param.get("threads", config.get("threads", None))
 2578
 2579        # Check threads
 2580        if not input_thread:
 2581            threads = default
 2582        elif int(input_thread) <= 0:
 2583            threads = os.cpu_count()
 2584        else:
 2585            threads = int(input_thread)
 2586        return threads
 2587
 2588    def get_memory(self, default: str = None) -> str:
 2589        """
 2590        This function retrieves the memory value from parameters or configuration with a default value
 2591        if not found.
 2592
 2593        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2594        default value is used as a fallback in case the `memory` parameter is not provided in the
 2595        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2596        the function
 2597        :type default: str
 2598        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2599        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2600        return the default value provided as an argument to the function.
 2601        """
 2602
 2603        # Config
 2604        config = self.get_config()
 2605
 2606        # Param
 2607        param = self.get_param()
 2608
 2609        # Input threads
 2610        input_memory = param.get("memory", config.get("memory", None))
 2611
 2612        # Check threads
 2613        if input_memory:
 2614            memory = input_memory
 2615        else:
 2616            memory = default
 2617
 2618        return memory
 2619
 2620    def update_from_vcf(self, vcf_file: str) -> None:
 2621        """
 2622        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2623
 2624        :param vcf_file: the path to the VCF file
 2625        """
 2626
 2627        connexion_format = self.get_connexion_format()
 2628
 2629        if connexion_format in ["duckdb"]:
 2630            self.update_from_vcf_duckdb(vcf_file)
 2631        elif connexion_format in ["sqlite"]:
 2632            self.update_from_vcf_sqlite(vcf_file)
 2633
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (DuckDB).

        The VCF is loaded into a pandas DataFrame which the UPDATE query
        references directly by its local name (``FROM vcf_df``) — DuckDB
        resolves in-scope DataFrames through its replacement scan mechanism.
        For each matching variant (#CHROM, POS, REF, ALT), the VCF INFO
        string is appended to the existing INFO value, separated by ';' when
        both sides are non-empty ('' and '.' are treated as empty).

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF records into a DataFrame, skipping the leading header
        # lines so the column line becomes the DataFrame header
        # (assumes get_header_length counts only the "##" meta lines,
        # leaving "#CHROM" as the header row — TODO confirm)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: vcf_df looks unused but is read by the query below
        # ("FROM vcf_df as table_parquet") via DuckDB's replacement scan.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2689
 2690    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2691        """
 2692        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2693        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2694        table
 2695
 2696        :param vcf_file: The path to the VCF file you want to update the database with
 2697        """
 2698
 2699        # Create a temporary table for the VCF
 2700        table_vcf = "tmp_vcf"
 2701        sql_create = (
 2702            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2703        )
 2704        self.conn.execute(sql_create)
 2705
 2706        # Loading VCF into temporaire table
 2707        vcf_df = pd.read_csv(
 2708            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2709        )
 2710        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2711        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2712
 2713        # Update table 'variants' with VCF data
 2714        # warning: CONCAT as || operator
 2715        sql_query_update = f"""
 2716            UPDATE variants as table_variants
 2717            SET INFO = CASE
 2718                            WHEN INFO NOT IN ('', '.')
 2719                            THEN INFO
 2720                            ELSE ''
 2721                        END ||
 2722                        (
 2723                        SELECT 
 2724                            CASE 
 2725                                WHEN table_variants.INFO NOT IN ('','.') 
 2726                                    AND table_vcf.INFO NOT IN ('','.')  
 2727                                THEN ';' 
 2728                                ELSE '' 
 2729                            END || 
 2730                            CASE 
 2731                                WHEN table_vcf.INFO NOT IN ('','.') 
 2732                                THEN table_vcf.INFO 
 2733                                ELSE '' 
 2734                            END
 2735                        FROM {table_vcf} as table_vcf
 2736                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2737                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2738                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2739                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2740                        )
 2741        """
 2742        self.conn.execute(sql_query_update)
 2743
 2744        # Drop temporary table
 2745        sql_drop = f"DROP TABLE {table_vcf}"
 2746        self.conn.execute(sql_drop)
 2747
 2748    def drop_variants_table(self) -> None:
 2749        """
 2750        > This function drops the variants table
 2751        """
 2752
 2753        table_variants = self.get_table_variants()
 2754        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2755        self.conn.execute(sql_table_variants)
 2756
 2757    def set_variant_id(
 2758        self, variant_id_column: str = "variant_id", force: bool = None
 2759    ) -> str:
 2760        """
 2761        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2762        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2763
 2764        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2765        to variant_id
 2766        :type variant_id_column: str (optional)
 2767        :param force: If True, the variant_id column will be created even if it already exists
 2768        :type force: bool
 2769        :return: The name of the column that contains the variant_id
 2770        """
 2771
 2772        # Assembly
 2773        assembly = self.get_param().get(
 2774            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2775        )
 2776
 2777        # INFO/Tag prefix
 2778        prefix = self.get_explode_infos_prefix()
 2779
 2780        # Explode INFO/SVTYPE
 2781        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2782
 2783        # variants table
 2784        table_variants = self.get_table_variants()
 2785
 2786        # variant_id column
 2787        if not variant_id_column:
 2788            variant_id_column = "variant_id"
 2789
 2790        # Creta variant_id column
 2791        if "variant_id" not in self.get_extra_infos() or force:
 2792
 2793            # Create column
 2794            self.add_column(
 2795                table_name=table_variants,
 2796                column_name=variant_id_column,
 2797                column_type="UBIGINT",
 2798                default_value="0",
 2799            )
 2800
 2801            # Update column
 2802            self.conn.execute(
 2803                f"""
 2804                    UPDATE {table_variants}
 2805                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2806                """
 2807            )
 2808
 2809        # Remove added columns
 2810        for added_column in added_columns:
 2811            self.drop_column(column=added_column)
 2812
 2813        # return variant_id column name
 2814        return variant_id_column
 2815
 2816    def get_variant_id_column(
 2817        self, variant_id_column: str = "variant_id", force: bool = None
 2818    ) -> str:
 2819        """
 2820        This function returns the variant_id column name
 2821
 2822        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2823        defaults to variant_id
 2824        :type variant_id_column: str (optional)
 2825        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2826        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2827        if it is not already set, or if it is set
 2828        :type force: bool
 2829        :return: The variant_id column name.
 2830        """
 2831
 2832        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2833
 2834    ###
 2835    # Annotation
 2836    ###
 2837
 2838    def scan_databases(
 2839        self,
 2840        database_formats: list = ["parquet"],
 2841        database_releases: list = ["current"],
 2842    ) -> dict:
 2843        """
 2844        The function `scan_databases` scans for available databases based on specified formats and
 2845        releases.
 2846
 2847        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2848        of the databases to be scanned. In this case, the accepted format is "parquet"
 2849        :type database_formats: list ["parquet"]
 2850        :param database_releases: The `database_releases` parameter is a list that specifies the
 2851        releases of the databases to be scanned. In the provided function, the default value for
 2852        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2853        databases that are in the "current"
 2854        :type database_releases: list
 2855        :return: The function `scan_databases` returns a dictionary containing information about
 2856        databases that match the specified formats and releases.
 2857        """
 2858
 2859        # Config
 2860        config = self.get_config()
 2861
 2862        # Param
 2863        param = self.get_param()
 2864
 2865        # Param - Assembly
 2866        assembly = param.get("assembly", config.get("assembly", None))
 2867        if not assembly:
 2868            assembly = DEFAULT_ASSEMBLY
 2869            log.warning(f"Default assembly '{assembly}'")
 2870
 2871        # Scan for availabled databases
 2872        log.info(
 2873            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2874        )
 2875        databases_infos_dict = databases_infos(
 2876            database_folder_releases=database_releases,
 2877            database_formats=database_formats,
 2878            assembly=assembly,
 2879            config=config,
 2880        )
 2881        log.info(
 2882            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2883        )
 2884
 2885        return databases_infos_dict
 2886
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        The method proceeds in three phases:

        1. Collect quick-annotation parameters: the comma-separated
           `param["annotations"]` string plus the per-tool shortcut parameters
           (`annotation_parquet`, `annotation_snpsift`, `annotation_snpeff`,
           `annotation_bcftools`, `annotation_annovar`, `annotation_exomiser`,
           `annotation_splice`) are merged into a single comma-separated list.
        2. Resolve each entry: 'ALL[:format=...][:release=...]' is expanded by
           scanning available databases; tool-prefixed entries ('snpeff:',
           'annovar:', 'exomiser:', 'splice:', 'bcftools:', 'snpsift:',
           'bigwig:') are routed to their tool; remaining entries are database
           files searched on disk and dispatched to a tool according to their
           detected format. Results are stored under `param["annotation"][<tool>]`.
        3. Run every configured annotation tool, then re-explode INFO fields
           into table columns if requested.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param takes precedence over config, then default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders
        # (annotations + parquet + bcftools database folders, de-duplicated)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string of annotation sources)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param
        # Per-tool shortcut parameters are folded into the same comma-separated
        # list. ',' is the top-level separator, so multiple databases for one
        # tool are joined with '+' instead.
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into a single string
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters
            # Normalize to a dict of {annotation_source: fields}; a plain string
            # becomes entries with default fields {"INFO": None} (all fields).
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL
                # 'ALL' entries are expanded into every database found on disk
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases), e.g.
                    # 'ALL:format=parquet+vcf:release=current'
                    # NOTE(review): when no option is given, the defaults are
                    # plain strings ("parquet"/"current"), not lists, before
                    # being passed to scan_databases — confirm databases_infos
                    # accepts both forms.
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        # (everything after the 'snpeff:' prefix)
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options: each ':'-separated token is an Annovar
                        # annotation code
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection: an explicit tool prefix wins;
                        # otherwise the tool is chosen later from the file format
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' both separate files here)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file: search order is literal path,
                                # expanded path, then assembly subfolders of the
                                # databases folders (and the folders themselves
                                # when no assembly is set)
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    # Inspect the database to pick a tool from
                                    # its format/compression/index status
                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # bcftools is never preferred here; kept as
                                    # a switch for the dispatch logic below
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    # ("tsv" is listed twice below — harmless)
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in ["bw"]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the
                                    # found file under its tool's annotations
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Run each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 3264
 3265    def annotation_bigwig(self, threads: int = None) -> None:
 3266        """
 3267        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3268
 3269        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3270        number of threads to be used for parallel processing during the annotation process. If the
 3271        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3272        threads to use based on the system configuration
 3273        :type threads: int
 3274        :return: True
 3275        """
 3276
 3277        # DEBUG
 3278        log.debug("Start annotation with bigwig databases")
 3279
 3280        # # Threads
 3281        # if not threads:
 3282        #     threads = self.get_threads()
 3283        # log.debug("Threads: " + str(threads))
 3284
 3285        # Config
 3286        config = self.get_config()
 3287        log.debug("Config: " + str(config))
 3288
 3289        # Config - BCFTools databases folders
 3290        databases_folders = set(
 3291            self.get_config()
 3292            .get("folders", {})
 3293            .get("databases", {})
 3294            .get("annotations", ["."])
 3295            + self.get_config()
 3296            .get("folders", {})
 3297            .get("databases", {})
 3298            .get("bigwig", ["."])
 3299        )
 3300        log.debug("Databases annotations: " + str(databases_folders))
 3301
 3302        # Param
 3303        annotations = (
 3304            self.get_param()
 3305            .get("annotation", {})
 3306            .get("bigwig", {})
 3307            .get("annotations", None)
 3308        )
 3309        log.debug("Annotations: " + str(annotations))
 3310
 3311        # Assembly
 3312        assembly = self.get_param().get(
 3313            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3314        )
 3315
 3316        # Data
 3317        table_variants = self.get_table_variants()
 3318
 3319        # Check if not empty
 3320        log.debug("Check if not empty")
 3321        sql_query_chromosomes = (
 3322            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3323        )
 3324        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3325        if not sql_query_chromosomes_df["count"][0]:
 3326            log.info(f"VCF empty")
 3327            return
 3328
 3329        # VCF header
 3330        vcf_reader = self.get_header()
 3331        log.debug("Initial header: " + str(vcf_reader.infos))
 3332
 3333        # Existing annotations
 3334        for vcf_annotation in self.get_header().infos:
 3335
 3336            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3337            log.debug(
 3338                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3339            )
 3340
 3341        if annotations:
 3342
 3343            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3344
 3345                # Export VCF file
 3346                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3347
 3348                # annotation_bigwig_config
 3349                annotation_bigwig_config_list = []
 3350
 3351                for annotation in annotations:
 3352                    annotation_fields = annotations[annotation]
 3353
 3354                    # Annotation Name
 3355                    annotation_name = os.path.basename(annotation)
 3356
 3357                    if not annotation_fields:
 3358                        annotation_fields = {"INFO": None}
 3359
 3360                    log.debug(f"Annotation '{annotation_name}'")
 3361                    log.debug(
 3362                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3363                    )
 3364
 3365                    # Create Database
 3366                    database = Database(
 3367                        database=annotation,
 3368                        databases_folders=databases_folders,
 3369                        assembly=assembly,
 3370                    )
 3371
 3372                    # Find files
 3373                    db_file = database.get_database()
 3374                    db_file = full_path(db_file)
 3375                    db_hdr_file = database.get_header_file()
 3376                    db_hdr_file = full_path(db_hdr_file)
 3377                    db_file_type = database.get_format()
 3378
 3379                    # If db_file is http ?
 3380                    if database.get_database().startswith("http"):
 3381
 3382                        # Datbase is HTTP URL
 3383                        db_file_is_http = True
 3384
 3385                        # DB file keep as URL
 3386                        db_file = database.get_database()
 3387                        log.warning(
 3388                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3389                        )
 3390
 3391                        # Retrieve automatic annotation field name
 3392                        annotation_field = clean_annotation_field(
 3393                            os.path.basename(db_file).replace(".bw", "")
 3394                        )
 3395                        log.debug(
 3396                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3397                        )
 3398
 3399                        # Create automatic header file
 3400                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3401                        with open(db_hdr_file, "w") as f:
 3402                            f.write("##fileformat=VCFv4.2\n")
 3403                            f.write(
 3404                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3405                            )
 3406                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3407
 3408                    else:
 3409
 3410                        # Datbase is NOT HTTP URL
 3411                        db_file_is_http = False
 3412
 3413                    # Check index - try to create if not exists
 3414                    if (
 3415                        db_file is None
 3416                        or db_hdr_file is None
 3417                        or (not os.path.exists(db_file) and not db_file_is_http)
 3418                        or not os.path.exists(db_hdr_file)
 3419                        or not db_file_type in ["bw"]
 3420                    ):
 3421                        # if False:
 3422                        log.error("Annotation failed: database not valid")
 3423                        log.error(f"Annotation annotation file: {db_file}")
 3424                        log.error(f"Annotation annotation file type: {db_file_type}")
 3425                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3426                        raise ValueError(
 3427                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3428                        )
 3429                    else:
 3430
 3431                        # Log
 3432                        log.debug(
 3433                            f"Annotation '{annotation}' - file: "
 3434                            + str(db_file)
 3435                            + " and "
 3436                            + str(db_hdr_file)
 3437                        )
 3438
 3439                        # Load header as VCF object
 3440                        db_hdr_vcf = Variants(input=db_hdr_file)
 3441                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3442                        log.debug(
 3443                            "Annotation database header: "
 3444                            + str(db_hdr_vcf_header_infos)
 3445                        )
 3446
 3447                        # For all fields in database
 3448                        annotation_fields_full = False
 3449                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3450                            annotation_fields = {
 3451                                key: key for key in db_hdr_vcf_header_infos
 3452                            }
 3453                            log.debug(
 3454                                "Annotation database header - All annotations added: "
 3455                                + str(annotation_fields)
 3456                            )
 3457                            annotation_fields_full = True
 3458
 3459                        # Init
 3460                        cyvcf2_header_rename_dict = {}
 3461                        cyvcf2_header_list = []
 3462                        cyvcf2_header_indexes = {}
 3463
 3464                        # process annotation fields
 3465                        for annotation_field in annotation_fields:
 3466
 3467                            # New annotation name
 3468                            annotation_field_new = annotation_fields[annotation_field]
 3469
 3470                            # Check annotation field and index in header
 3471                            if (
 3472                                annotation_field
 3473                                in db_hdr_vcf.get_header_columns_as_list()
 3474                            ):
 3475                                annotation_field_index = (
 3476                                    db_hdr_vcf.get_header_columns_as_list().index(
 3477                                        annotation_field
 3478                                    )
 3479                                    - 3
 3480                                )
 3481                                cyvcf2_header_indexes[annotation_field_new] = (
 3482                                    annotation_field_index
 3483                                )
 3484                            else:
 3485                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3486                                log.error(msg_err)
 3487                                raise ValueError(msg_err)
 3488
 3489                            # Append annotation field in cyvcf2 header list
 3490                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3491                                db_hdr_vcf_header_infos[annotation_field].id
 3492                            )
 3493                            cyvcf2_header_list.append(
 3494                                {
 3495                                    "ID": annotation_field_new,
 3496                                    "Number": db_hdr_vcf_header_infos[
 3497                                        annotation_field
 3498                                    ].num,
 3499                                    "Type": db_hdr_vcf_header_infos[
 3500                                        annotation_field
 3501                                    ].type,
 3502                                    "Description": db_hdr_vcf_header_infos[
 3503                                        annotation_field
 3504                                    ].desc,
 3505                                }
 3506                            )
 3507
 3508                            # Add header on VCF
 3509                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3510                                annotation_field_new,
 3511                                db_hdr_vcf_header_infos[annotation_field].num,
 3512                                db_hdr_vcf_header_infos[annotation_field].type,
 3513                                db_hdr_vcf_header_infos[annotation_field].desc,
 3514                                "HOWARD BigWig annotation",
 3515                                "unknown",
 3516                                self.code_type_map[
 3517                                    db_hdr_vcf_header_infos[annotation_field].type
 3518                                ],
 3519                            )
 3520
 3521                        # Load bigwig database
 3522                        bw_db = pyBigWig.open(db_file)
 3523                        if bw_db.isBigWig():
 3524                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3525                        else:
 3526                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3527                            log.error(msg_err)
 3528                            raise ValueError(msg_err)
 3529
 3530                        annotation_bigwig_config_list.append(
 3531                            {
 3532                                "db_file": db_file,
 3533                                "bw_db": bw_db,
 3534                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3535                                "cyvcf2_header_list": cyvcf2_header_list,
 3536                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3537                            }
 3538                        )
 3539
 3540                # Annotate
 3541                if annotation_bigwig_config_list:
 3542
 3543                    # Annotation config
 3544                    log.debug(
 3545                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3546                    )
 3547
 3548                    # Export VCF file
 3549                    self.export_variant_vcf(
 3550                        vcf_file=tmp_vcf_name,
 3551                        remove_info=True,
 3552                        add_samples=False,
 3553                        index=True,
 3554                    )
 3555
 3556                    # Load input tmp file
 3557                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3558
 3559                    # Add header in input file
 3560                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3561                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3562                            "cyvcf2_header_list", []
 3563                        ):
 3564                            log.info(
 3565                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3566                            )
 3567                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3568
 3569                    # Create output VCF file
 3570                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3571                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3572
 3573                    # Fetch variants
 3574                    log.info(f"Annotations 'bigwig' start...")
 3575                    for variant in input_vcf:
 3576
 3577                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3578
 3579                            # DB and indexes
 3580                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3581                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3582                                "cyvcf2_header_indexes", None
 3583                            )
 3584
 3585                            # Retrieve value from chrom pos
 3586                            res = bw_db.values(
 3587                                variant.CHROM, variant.POS - 1, variant.POS
 3588                            )
 3589
 3590                            # For each annotation fields (and indexes)
 3591                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3592
 3593                                # If value is NOT nNone
 3594                                if not np.isnan(
 3595                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3596                                ):
 3597                                    variant.INFO[cyvcf2_header_index] = res[
 3598                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3599                                    ]
 3600
 3601                        # Add record in output file
 3602                        output_vcf.write_record(variant)
 3603
 3604                    # Log
 3605                    log.debug(f"Annotation done.")
 3606
 3607                    # Close and write file
 3608                    log.info(f"Annotations 'bigwig' write...")
 3609                    output_vcf.close()
 3610                    log.debug(f"Write done.")
 3611
 3612                    # Update variants
 3613                    log.info(f"Annotations 'bigwig' update...")
 3614                    self.update_from_vcf(output_vcf_file)
 3615                    log.debug(f"Update done.")
 3616
 3617        return True
 3618
 3619    def annotation_snpsift(self, threads: int = None) -> None:
 3620        """
 3621        This function annotate with bcftools
 3622
 3623        :param threads: Number of threads to use
 3624        :return: the value of the variable "return_value".
 3625        """
 3626
 3627        # DEBUG
 3628        log.debug("Start annotation with bcftools databases")
 3629
 3630        # Threads
 3631        if not threads:
 3632            threads = self.get_threads()
 3633        log.debug("Threads: " + str(threads))
 3634
 3635        # Config
 3636        config = self.get_config()
 3637        log.debug("Config: " + str(config))
 3638
 3639        # Config - snpSift
 3640        snpsift_bin_command = get_bin_command(
 3641            bin="SnpSift.jar",
 3642            tool="snpsift",
 3643            bin_type="jar",
 3644            config=config,
 3645            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3646        )
 3647        if not snpsift_bin_command:
 3648            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3649            log.error(msg_err)
 3650            raise ValueError(msg_err)
 3651
 3652        # Config - bcftools
 3653        bcftools_bin_command = get_bin_command(
 3654            bin="bcftools",
 3655            tool="bcftools",
 3656            bin_type="bin",
 3657            config=config,
 3658            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3659        )
 3660        if not bcftools_bin_command:
 3661            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3662            log.error(msg_err)
 3663            raise ValueError(msg_err)
 3664
 3665        # Config - BCFTools databases folders
 3666        databases_folders = set(
 3667            self.get_config()
 3668            .get("folders", {})
 3669            .get("databases", {})
 3670            .get("annotations", ["."])
 3671            + self.get_config()
 3672            .get("folders", {})
 3673            .get("databases", {})
 3674            .get("bcftools", ["."])
 3675        )
 3676        log.debug("Databases annotations: " + str(databases_folders))
 3677
 3678        # Param
 3679        annotations = (
 3680            self.get_param()
 3681            .get("annotation", {})
 3682            .get("snpsift", {})
 3683            .get("annotations", None)
 3684        )
 3685        log.debug("Annotations: " + str(annotations))
 3686
 3687        # Assembly
 3688        assembly = self.get_param().get(
 3689            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3690        )
 3691
 3692        # Data
 3693        table_variants = self.get_table_variants()
 3694
 3695        # Check if not empty
 3696        log.debug("Check if not empty")
 3697        sql_query_chromosomes = (
 3698            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3699        )
 3700        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3701        if not sql_query_chromosomes_df["count"][0]:
 3702            log.info(f"VCF empty")
 3703            return
 3704
 3705        # VCF header
 3706        vcf_reader = self.get_header()
 3707        log.debug("Initial header: " + str(vcf_reader.infos))
 3708
 3709        # Existing annotations
 3710        for vcf_annotation in self.get_header().infos:
 3711
 3712            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3713            log.debug(
 3714                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3715            )
 3716
 3717        if annotations:
 3718
 3719            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3720
 3721                # Export VCF file
 3722                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3723
 3724                # Init
 3725                commands = {}
 3726
 3727                for annotation in annotations:
 3728                    annotation_fields = annotations[annotation]
 3729
 3730                    # Annotation Name
 3731                    annotation_name = os.path.basename(annotation)
 3732
 3733                    if not annotation_fields:
 3734                        annotation_fields = {"INFO": None}
 3735
 3736                    log.debug(f"Annotation '{annotation_name}'")
 3737                    log.debug(
 3738                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3739                    )
 3740
 3741                    # Create Database
 3742                    database = Database(
 3743                        database=annotation,
 3744                        databases_folders=databases_folders,
 3745                        assembly=assembly,
 3746                    )
 3747
 3748                    # Find files
 3749                    db_file = database.get_database()
 3750                    db_file = full_path(db_file)
 3751                    db_hdr_file = database.get_header_file()
 3752                    db_hdr_file = full_path(db_hdr_file)
 3753                    db_file_type = database.get_format()
 3754                    db_tbi_file = f"{db_file}.tbi"
 3755                    db_file_compressed = database.is_compressed()
 3756
 3757                    # Check if compressed
 3758                    if not db_file_compressed:
 3759                        log.error(
 3760                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3761                        )
 3762                        raise ValueError(
 3763                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3764                        )
 3765
 3766                    # Check if indexed
 3767                    if not os.path.exists(db_tbi_file):
 3768                        log.error(
 3769                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3770                        )
 3771                        raise ValueError(
 3772                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3773                        )
 3774
 3775                    # Check index - try to create if not exists
 3776                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3777                        log.error("Annotation failed: database not valid")
 3778                        log.error(f"Annotation annotation file: {db_file}")
 3779                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3780                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3781                        raise ValueError(
 3782                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3783                        )
 3784                    else:
 3785
 3786                        log.debug(
 3787                            f"Annotation '{annotation}' - file: "
 3788                            + str(db_file)
 3789                            + " and "
 3790                            + str(db_hdr_file)
 3791                        )
 3792
 3793                        # Load header as VCF object
 3794                        db_hdr_vcf = Variants(input=db_hdr_file)
 3795                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3796                        log.debug(
 3797                            "Annotation database header: "
 3798                            + str(db_hdr_vcf_header_infos)
 3799                        )
 3800
 3801                        # For all fields in database
 3802                        annotation_fields_full = False
 3803                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3804                            annotation_fields = {
 3805                                key: key for key in db_hdr_vcf_header_infos
 3806                            }
 3807                            log.debug(
 3808                                "Annotation database header - All annotations added: "
 3809                                + str(annotation_fields)
 3810                            )
 3811                            annotation_fields_full = True
 3812
 3813                        # # Create file for field rename
 3814                        # log.debug("Create file for field rename")
 3815                        # tmp_rename = NamedTemporaryFile(
 3816                        #     prefix=self.get_prefix(),
 3817                        #     dir=self.get_tmp_dir(),
 3818                        #     suffix=".rename",
 3819                        #     delete=False,
 3820                        # )
 3821                        # tmp_rename_name = tmp_rename.name
 3822                        # tmp_files.append(tmp_rename_name)
 3823
 3824                        # Number of fields
 3825                        nb_annotation_field = 0
 3826                        annotation_list = []
 3827                        annotation_infos_rename_list = []
 3828
 3829                        for annotation_field in annotation_fields:
 3830
 3831                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3832                            annotation_fields_new_name = annotation_fields.get(
 3833                                annotation_field, annotation_field
 3834                            )
 3835                            if not annotation_fields_new_name:
 3836                                annotation_fields_new_name = annotation_field
 3837
 3838                            # Check if field is in DB and if field is not elready in input data
 3839                            if (
 3840                                annotation_field in db_hdr_vcf.get_header().infos
 3841                                and annotation_fields_new_name
 3842                                not in self.get_header().infos
 3843                            ):
 3844
 3845                                log.info(
 3846                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3847                                )
 3848
 3849                                # BCFTools annotate param to rename fields
 3850                                if annotation_field != annotation_fields_new_name:
 3851                                    annotation_infos_rename_list.append(
 3852                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3853                                    )
 3854
 3855                                # Add INFO field to header
 3856                                db_hdr_vcf_header_infos_number = (
 3857                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3858                                )
 3859                                db_hdr_vcf_header_infos_type = (
 3860                                    db_hdr_vcf_header_infos[annotation_field].type
 3861                                    or "String"
 3862                                )
 3863                                db_hdr_vcf_header_infos_description = (
 3864                                    db_hdr_vcf_header_infos[annotation_field].desc
 3865                                    or f"{annotation_field} description"
 3866                                )
 3867                                db_hdr_vcf_header_infos_source = (
 3868                                    db_hdr_vcf_header_infos[annotation_field].source
 3869                                    or "unknown"
 3870                                )
 3871                                db_hdr_vcf_header_infos_version = (
 3872                                    db_hdr_vcf_header_infos[annotation_field].version
 3873                                    or "unknown"
 3874                                )
 3875
 3876                                vcf_reader.infos[annotation_fields_new_name] = (
 3877                                    vcf.parser._Info(
 3878                                        annotation_fields_new_name,
 3879                                        db_hdr_vcf_header_infos_number,
 3880                                        db_hdr_vcf_header_infos_type,
 3881                                        db_hdr_vcf_header_infos_description,
 3882                                        db_hdr_vcf_header_infos_source,
 3883                                        db_hdr_vcf_header_infos_version,
 3884                                        self.code_type_map[
 3885                                            db_hdr_vcf_header_infos_type
 3886                                        ],
 3887                                    )
 3888                                )
 3889
 3890                                annotation_list.append(annotation_field)
 3891
 3892                                nb_annotation_field += 1
 3893
 3894                            else:
 3895
 3896                                if (
 3897                                    annotation_field
 3898                                    not in db_hdr_vcf.get_header().infos
 3899                                ):
 3900                                    log.warning(
 3901                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3902                                    )
 3903                                if (
 3904                                    annotation_fields_new_name
 3905                                    in self.get_header().infos
 3906                                ):
 3907                                    log.warning(
 3908                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3909                                    )
 3910
 3911                        log.info(
 3912                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3913                        )
 3914
 3915                        annotation_infos = ",".join(annotation_list)
 3916
 3917                        if annotation_infos != "":
 3918
 3919                            # Annotated VCF (and error file)
 3920                            tmp_annotation_vcf_name = os.path.join(
 3921                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3922                            )
 3923                            tmp_annotation_vcf_name_err = (
 3924                                tmp_annotation_vcf_name + ".err"
 3925                            )
 3926
 3927                            # Add fields to annotate
 3928                            if not annotation_fields_full:
 3929                                annotation_infos_option = f"-info {annotation_infos}"
 3930                            else:
 3931                                annotation_infos_option = ""
 3932
 3933                            # Info fields rename
 3934                            if annotation_infos_rename_list:
 3935                                annotation_infos_rename = " -c " + ",".join(
 3936                                    annotation_infos_rename_list
 3937                                )
 3938                            else:
 3939                                annotation_infos_rename = ""
 3940
 3941                            # Annotate command
 3942                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3943
 3944                            # Add command
 3945                            commands[command_annotate] = tmp_annotation_vcf_name
 3946
 3947                if commands:
 3948
 3949                    # Export VCF file
 3950                    self.export_variant_vcf(
 3951                        vcf_file=tmp_vcf_name,
 3952                        remove_info=True,
 3953                        add_samples=False,
 3954                        index=True,
 3955                    )
 3956                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3957
 3958                    # Num command
 3959                    nb_command = 0
 3960
 3961                    # Annotate
 3962                    for command_annotate in commands:
 3963                        nb_command += 1
 3964                        log.info(
 3965                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3966                        )
 3967                        log.debug(f"command_annotate={command_annotate}")
 3968                        run_parallel_commands([command_annotate], threads)
 3969
 3970                        # Debug
 3971                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3972
 3973                        # Update variants
 3974                        log.info(
 3975                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3976                        )
 3977                        self.update_from_vcf(commands[command_annotate])
 3978
 3979    def annotation_bcftools(self, threads: int = None) -> None:
 3980        """
 3981        This function annotate with bcftools
 3982
 3983        :param threads: Number of threads to use
 3984        :return: the value of the variable "return_value".
 3985        """
 3986
 3987        # DEBUG
 3988        log.debug("Start annotation with bcftools databases")
 3989
 3990        # Threads
 3991        if not threads:
 3992            threads = self.get_threads()
 3993        log.debug("Threads: " + str(threads))
 3994
 3995        # Config
 3996        config = self.get_config()
 3997        log.debug("Config: " + str(config))
 3998
 3999        # DEBUG
 4000        delete_tmp = True
 4001        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4002            delete_tmp = False
 4003            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4004
 4005        # Config - BCFTools bin command
 4006        bcftools_bin_command = get_bin_command(
 4007            bin="bcftools",
 4008            tool="bcftools",
 4009            bin_type="bin",
 4010            config=config,
 4011            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 4012        )
 4013        if not bcftools_bin_command:
 4014            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 4015            log.error(msg_err)
 4016            raise ValueError(msg_err)
 4017
 4018        # Config - BCFTools databases folders
 4019        databases_folders = set(
 4020            self.get_config()
 4021            .get("folders", {})
 4022            .get("databases", {})
 4023            .get("annotations", ["."])
 4024            + self.get_config()
 4025            .get("folders", {})
 4026            .get("databases", {})
 4027            .get("bcftools", ["."])
 4028        )
 4029        log.debug("Databases annotations: " + str(databases_folders))
 4030
 4031        # Param
 4032        annotations = (
 4033            self.get_param()
 4034            .get("annotation", {})
 4035            .get("bcftools", {})
 4036            .get("annotations", None)
 4037        )
 4038        log.debug("Annotations: " + str(annotations))
 4039
 4040        # Assembly
 4041        assembly = self.get_param().get(
 4042            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4043        )
 4044
 4045        # Data
 4046        table_variants = self.get_table_variants()
 4047
 4048        # Check if not empty
 4049        log.debug("Check if not empty")
 4050        sql_query_chromosomes = (
 4051            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4052        )
 4053        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4054        if not sql_query_chromosomes_df["count"][0]:
 4055            log.info(f"VCF empty")
 4056            return
 4057
 4058        # Export in VCF
 4059        log.debug("Create initial file to annotate")
 4060        tmp_vcf = NamedTemporaryFile(
 4061            prefix=self.get_prefix(),
 4062            dir=self.get_tmp_dir(),
 4063            suffix=".vcf.gz",
 4064            delete=False,
 4065        )
 4066        tmp_vcf_name = tmp_vcf.name
 4067
 4068        # VCF header
 4069        vcf_reader = self.get_header()
 4070        log.debug("Initial header: " + str(vcf_reader.infos))
 4071
 4072        # Existing annotations
 4073        for vcf_annotation in self.get_header().infos:
 4074
 4075            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4076            log.debug(
 4077                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4078            )
 4079
 4080        if annotations:
 4081
 4082            tmp_ann_vcf_list = []
 4083            commands = []
 4084            tmp_files = []
 4085            err_files = []
 4086
 4087            for annotation in annotations:
 4088                annotation_fields = annotations[annotation]
 4089
 4090                # Annotation Name
 4091                annotation_name = os.path.basename(annotation)
 4092
 4093                if not annotation_fields:
 4094                    annotation_fields = {"INFO": None}
 4095
 4096                log.debug(f"Annotation '{annotation_name}'")
 4097                log.debug(
 4098                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4099                )
 4100
 4101                # Create Database
 4102                database = Database(
 4103                    database=annotation,
 4104                    databases_folders=databases_folders,
 4105                    assembly=assembly,
 4106                )
 4107
 4108                # Find files
 4109                db_file = database.get_database()
 4110                db_file = full_path(db_file)
 4111                db_hdr_file = database.get_header_file()
 4112                db_hdr_file = full_path(db_hdr_file)
 4113                db_file_type = database.get_format()
 4114                db_tbi_file = f"{db_file}.tbi"
 4115                db_file_compressed = database.is_compressed()
 4116
 4117                # Check if compressed
 4118                if not db_file_compressed:
 4119                    log.error(
 4120                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4121                    )
 4122                    raise ValueError(
 4123                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4124                    )
 4125
 4126                # Check if indexed
 4127                if not os.path.exists(db_tbi_file):
 4128                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4129                    raise ValueError(
 4130                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4131                    )
 4132
 4133                # Check index - try to create if not exists
 4134                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4135                    log.error("Annotation failed: database not valid")
 4136                    log.error(f"Annotation annotation file: {db_file}")
 4137                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4138                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4139                    raise ValueError(
 4140                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4141                    )
 4142                else:
 4143
 4144                    log.debug(
 4145                        f"Annotation '{annotation}' - file: "
 4146                        + str(db_file)
 4147                        + " and "
 4148                        + str(db_hdr_file)
 4149                    )
 4150
 4151                    # Load header as VCF object
 4152                    db_hdr_vcf = Variants(input=db_hdr_file)
 4153                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4154                    log.debug(
 4155                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4156                    )
 4157
 4158                    # For all fields in database
 4159                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4160                        annotation_fields = {
 4161                            key: key for key in db_hdr_vcf_header_infos
 4162                        }
 4163                        log.debug(
 4164                            "Annotation database header - All annotations added: "
 4165                            + str(annotation_fields)
 4166                        )
 4167
 4168                    # Number of fields
 4169                    nb_annotation_field = 0
 4170                    annotation_list = []
 4171
 4172                    for annotation_field in annotation_fields:
 4173
 4174                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4175                        annotation_fields_new_name = annotation_fields.get(
 4176                            annotation_field, annotation_field
 4177                        )
 4178                        if not annotation_fields_new_name:
 4179                            annotation_fields_new_name = annotation_field
 4180
 4181                        # Check if field is in DB and if field is not elready in input data
 4182                        if (
 4183                            annotation_field in db_hdr_vcf.get_header().infos
 4184                            and annotation_fields_new_name
 4185                            not in self.get_header().infos
 4186                        ):
 4187
 4188                            log.info(
 4189                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4190                            )
 4191
 4192                            # Add INFO field to header
 4193                            db_hdr_vcf_header_infos_number = (
 4194                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4195                            )
 4196                            db_hdr_vcf_header_infos_type = (
 4197                                db_hdr_vcf_header_infos[annotation_field].type
 4198                                or "String"
 4199                            )
 4200                            db_hdr_vcf_header_infos_description = (
 4201                                db_hdr_vcf_header_infos[annotation_field].desc
 4202                                or f"{annotation_field} description"
 4203                            )
 4204                            db_hdr_vcf_header_infos_source = (
 4205                                db_hdr_vcf_header_infos[annotation_field].source
 4206                                or "unknown"
 4207                            )
 4208                            db_hdr_vcf_header_infos_version = (
 4209                                db_hdr_vcf_header_infos[annotation_field].version
 4210                                or "unknown"
 4211                            )
 4212
 4213                            vcf_reader.infos[annotation_fields_new_name] = (
 4214                                vcf.parser._Info(
 4215                                    annotation_fields_new_name,
 4216                                    db_hdr_vcf_header_infos_number,
 4217                                    db_hdr_vcf_header_infos_type,
 4218                                    db_hdr_vcf_header_infos_description,
 4219                                    db_hdr_vcf_header_infos_source,
 4220                                    db_hdr_vcf_header_infos_version,
 4221                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4222                                )
 4223                            )
 4224
 4225                            # annotation_list.append(annotation_field)
 4226                            if annotation_field != annotation_fields_new_name:
 4227                                annotation_list.append(
 4228                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4229                                )
 4230                            else:
 4231                                annotation_list.append(annotation_field)
 4232
 4233                            nb_annotation_field += 1
 4234
 4235                        else:
 4236
 4237                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4238                                log.warning(
 4239                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4240                                )
 4241                            if annotation_fields_new_name in self.get_header().infos:
 4242                                log.warning(
 4243                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4244                                )
 4245
 4246                    log.info(
 4247                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4248                    )
 4249
 4250                    annotation_infos = ",".join(annotation_list)
 4251
 4252                    if annotation_infos != "":
 4253
 4254                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4255                        log.debug("Protect Header file - remove #CHROM line if exists")
 4256                        tmp_header_vcf = NamedTemporaryFile(
 4257                            prefix=self.get_prefix(),
 4258                            dir=self.get_tmp_dir(),
 4259                            suffix=".hdr",
 4260                            delete=False,
 4261                        )
 4262                        tmp_header_vcf_name = tmp_header_vcf.name
 4263                        tmp_files.append(tmp_header_vcf_name)
 4264                        # Command
 4265                        if db_hdr_file.endswith(".gz"):
 4266                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4267                        else:
 4268                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4269                        # Run
 4270                        run_parallel_commands([command_extract_header], 1)
 4271
 4272                        # Find chomosomes
 4273                        log.debug("Find chromosomes ")
 4274                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4275                        sql_query_chromosomes_df = self.get_query_to_df(
 4276                            sql_query_chromosomes
 4277                        )
 4278                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4279
 4280                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4281
 4282                        # BED columns in the annotation file
 4283                        if db_file_type in ["bed"]:
 4284                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4285
 4286                        for chrom in chomosomes_list:
 4287
 4288                            # Create BED on initial VCF
 4289                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4290                            tmp_bed = NamedTemporaryFile(
 4291                                prefix=self.get_prefix(),
 4292                                dir=self.get_tmp_dir(),
 4293                                suffix=".bed",
 4294                                delete=False,
 4295                            )
 4296                            tmp_bed_name = tmp_bed.name
 4297                            tmp_files.append(tmp_bed_name)
 4298
 4299                            # Detecte regions
 4300                            log.debug(
 4301                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4302                            )
 4303                            window = 1000000
 4304                            sql_query_intervals_for_bed = f"""
 4305                                SELECT  \"#CHROM\",
 4306                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4307                                        \"POS\"+{window}
 4308                                FROM {table_variants} as table_variants
 4309                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4310                            """
 4311                            regions = self.conn.execute(
 4312                                sql_query_intervals_for_bed
 4313                            ).fetchall()
 4314                            merged_regions = merge_regions(regions)
 4315                            log.debug(
 4316                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4317                            )
 4318
 4319                            header = ["#CHROM", "START", "END"]
 4320                            with open(tmp_bed_name, "w") as f:
 4321                                # Write the header with tab delimiter
 4322                                f.write("\t".join(header) + "\n")
 4323                                for d in merged_regions:
 4324                                    # Write each data row with tab delimiter
 4325                                    f.write("\t".join(map(str, d)) + "\n")
 4326
 4327                            # Tmp files
 4328                            tmp_annotation_vcf = NamedTemporaryFile(
 4329                                prefix=self.get_prefix(),
 4330                                dir=self.get_tmp_dir(),
 4331                                suffix=".vcf.gz",
 4332                                delete=False,
 4333                            )
 4334                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4335                            tmp_files.append(tmp_annotation_vcf_name)
 4336                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4337                            tmp_annotation_vcf_name_err = (
 4338                                tmp_annotation_vcf_name + ".err"
 4339                            )
 4340                            err_files.append(tmp_annotation_vcf_name_err)
 4341
 4342                            # Annotate Command
 4343                            log.debug(
 4344                                f"Annotation '{annotation}' - add bcftools command"
 4345                            )
 4346
 4347                            # Command
 4348                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4349
 4350                            # Add command
 4351                            commands.append(command_annotate)
 4352
 4353            # if some commands
 4354            if commands:
 4355
 4356                # Export VCF file
 4357                self.export_variant_vcf(
 4358                    vcf_file=tmp_vcf_name,
 4359                    remove_info=True,
 4360                    add_samples=False,
 4361                    index=True,
 4362                )
 4363
 4364                # Threads
 4365                # calculate threads for annotated commands
 4366                if commands:
 4367                    threads_bcftools_annotate = round(threads / len(commands))
 4368                else:
 4369                    threads_bcftools_annotate = 1
 4370
 4371                if not threads_bcftools_annotate:
 4372                    threads_bcftools_annotate = 1
 4373
 4374                # Add threads option to bcftools commands
 4375                if threads_bcftools_annotate > 1:
 4376                    commands_threaded = []
 4377                    for command in commands:
 4378                        commands_threaded.append(
 4379                            command.replace(
 4380                                f"{bcftools_bin_command} annotate ",
 4381                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4382                            )
 4383                        )
 4384                    commands = commands_threaded
 4385
 4386                # Command annotation multithreading
 4387                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4388                log.info(
 4389                    f"Annotation - Annotation multithreaded in "
 4390                    + str(len(commands))
 4391                    + " commands"
 4392                )
 4393
 4394                run_parallel_commands(commands, threads)
 4395
 4396                # Merge
 4397                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4398
 4399                if tmp_ann_vcf_list_cmd:
 4400
 4401                    # Tmp file
 4402                    tmp_annotate_vcf = NamedTemporaryFile(
 4403                        prefix=self.get_prefix(),
 4404                        dir=self.get_tmp_dir(),
 4405                        suffix=".vcf.gz",
 4406                        delete=True,
 4407                    )
 4408                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4409                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4410                    err_files.append(tmp_annotate_vcf_name_err)
 4411
 4412                    # Tmp file remove command
 4413                    tmp_files_remove_command = ""
 4414                    if tmp_files:
 4415                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4416
 4417                    # Command merge
 4418                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4419                    log.info(
 4420                        f"Annotation - Annotation merging "
 4421                        + str(len(commands))
 4422                        + " annotated files"
 4423                    )
 4424                    log.debug(f"Annotation - merge command: {merge_command}")
 4425                    run_parallel_commands([merge_command], 1)
 4426
 4427                    # Error messages
 4428                    log.info(f"Error/Warning messages:")
 4429                    error_message_command_all = []
 4430                    error_message_command_warning = []
 4431                    error_message_command_err = []
 4432                    for err_file in err_files:
 4433                        with open(err_file, "r") as f:
 4434                            for line in f:
 4435                                message = line.strip()
 4436                                error_message_command_all.append(message)
 4437                                if line.startswith("[W::"):
 4438                                    error_message_command_warning.append(message)
 4439                                if line.startswith("[E::"):
 4440                                    error_message_command_err.append(
 4441                                        f"{err_file}: " + message
 4442                                    )
 4443                    # log info
 4444                    for message in list(
 4445                        set(error_message_command_err + error_message_command_warning)
 4446                    ):
 4447                        log.info(f"   {message}")
 4448                    # debug info
 4449                    for message in list(set(error_message_command_all)):
 4450                        log.debug(f"   {message}")
 4451                    # failed
 4452                    if len(error_message_command_err):
 4453                        log.error("Annotation failed: Error in commands")
 4454                        raise ValueError("Annotation failed: Error in commands")
 4455
 4456                    # Update variants
 4457                    log.info(f"Annotation - Updating...")
 4458                    self.update_from_vcf(tmp_annotate_vcf_name)
 4459
 4460    def annotation_exomiser(self, threads: int = None) -> None:
 4461        """
 4462        This function annotate with Exomiser
 4463
 4464        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4465        - "analysis" (dict/file):
 4466            Full analysis dictionnary parameters (see Exomiser docs).
 4467            Either a dict, or a file in JSON or YAML format.
 4468            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4469            Default : None
 4470        - "preset" (string):
 4471            Analysis preset (available in config folder).
 4472            Used if no full "analysis" is provided.
 4473            Default: "exome"
 4474        - "phenopacket" (dict/file):
 4475            Samples and phenotipic features parameters (see Exomiser docs).
 4476            Either a dict, or a file in JSON or YAML format.
 4477            Default: None
 4478        - "subject" (dict):
 4479            Sample parameters (see Exomiser docs).
 4480            Example:
 4481                "subject":
 4482                    {
 4483                        "id": "ISDBM322017",
 4484                        "sex": "FEMALE"
 4485                    }
 4486            Default: None
 4487        - "sample" (string):
 4488            Sample name to construct "subject" section:
 4489                "subject":
 4490                    {
 4491                        "id": "<sample>",
 4492                        "sex": "UNKNOWN_SEX"
 4493                    }
 4494            Default: None
 4495        - "phenotypicFeatures" (dict)
 4496            Phenotypic features to construct "subject" section.
 4497            Example:
 4498                "phenotypicFeatures":
 4499                    [
 4500                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4501                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4502                    ]
 4503        - "hpo" (list)
 4504            List of HPO ids as phenotypic features.
 4505            Example:
 4506                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4507            Default: []
 4508        - "outputOptions" (dict):
 4509            Output options (see Exomiser docs).
 4510            Default:
 4511                "output_options" =
 4512                    {
 4513                        "outputContributingVariantsOnly": False,
 4514                        "numGenes": 0,
 4515                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4516                    }
 4517        - "transcript_source" (string):
 4518            Transcript source (either "refseq", "ucsc", "ensembl")
 4519            Default: "refseq"
 4520        - "exomiser_to_info" (boolean):
 4521            Add exomiser TSV file columns as INFO fields in VCF.
 4522            Default: False
 4523        - "release" (string):
 4524            Exomise database release.
 4525            If not exists, database release will be downloaded (take a while).
 4526            Default: None (provided by application.properties configuration file)
 4527        - "exomiser_application_properties" (file):
 4528            Exomiser configuration file (see Exomiser docs).
 4529            Useful to automatically download databases (especially for specific genome databases).
 4530
 4531        Notes:
 4532        - If no sample in parameters, first sample in VCF will be chosen
 4533        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4534
 4535        :param threads: The number of threads to use
 4536        :return: None.
 4537        """
 4538
 4539        # DEBUG
 4540        log.debug("Start annotation with Exomiser databases")
 4541
 4542        # Threads
 4543        if not threads:
 4544            threads = self.get_threads()
 4545        log.debug("Threads: " + str(threads))
 4546
 4547        # Config
 4548        config = self.get_config()
 4549        log.debug("Config: " + str(config))
 4550
 4551        # Config - Folders - Databases
 4552        databases_folders = (
 4553            config.get("folders", {})
 4554            .get("databases", {})
 4555            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4556        )
 4557        databases_folders = full_path(databases_folders)
 4558        if not os.path.exists(databases_folders):
 4559            log.error(f"Databases annotations: {databases_folders} NOT found")
 4560        log.debug("Databases annotations: " + str(databases_folders))
 4561
 4562        # Config - Exomiser
 4563        exomiser_bin_command = get_bin_command(
 4564            bin="exomiser-cli*.jar",
 4565            tool="exomiser",
 4566            bin_type="jar",
 4567            config=config,
 4568            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4569        )
 4570        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4571        if not exomiser_bin_command:
 4572            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4573            log.error(msg_err)
 4574            raise ValueError(msg_err)
 4575
 4576        # Param
 4577        param = self.get_param()
 4578        log.debug("Param: " + str(param))
 4579
 4580        # Param - Exomiser
 4581        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4582        log.debug(f"Param Exomiser: {param_exomiser}")
 4583
 4584        # Param - Assembly
 4585        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4586        log.debug("Assembly: " + str(assembly))
 4587
 4588        # Data
 4589        table_variants = self.get_table_variants()
 4590
 4591        # Check if not empty
 4592        log.debug("Check if not empty")
 4593        sql_query_chromosomes = (
 4594            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4595        )
 4596        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4597            log.info(f"VCF empty")
 4598            return False
 4599
 4600        # VCF header
 4601        vcf_reader = self.get_header()
 4602        log.debug("Initial header: " + str(vcf_reader.infos))
 4603
 4604        # Samples
 4605        samples = self.get_header_sample_list()
 4606        if not samples:
 4607            log.error("No Samples in VCF")
 4608            return False
 4609        log.debug(f"Samples: {samples}")
 4610
 4611        # Memory limit
 4612        memory_limit = self.get_memory("8G")
 4613        log.debug(f"memory_limit: {memory_limit}")
 4614
 4615        # Exomiser java options
 4616        exomiser_java_options = (
 4617            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4618        )
 4619        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4620
 4621        # Download Exomiser (if not exists)
 4622        exomiser_release = param_exomiser.get("release", None)
 4623        exomiser_application_properties = param_exomiser.get(
 4624            "exomiser_application_properties", None
 4625        )
 4626        databases_download_exomiser(
 4627            assemblies=[assembly],
 4628            exomiser_folder=databases_folders,
 4629            exomiser_release=exomiser_release,
 4630            exomiser_phenotype_release=exomiser_release,
 4631            exomiser_application_properties=exomiser_application_properties,
 4632        )
 4633
 4634        # Force annotation
 4635        force_update_annotation = True
 4636
 4637        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4638            log.debug("Start annotation Exomiser")
 4639
 4640            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4641
 4642                # tmp_dir = "/tmp/exomiser"
 4643
 4644                ### ANALYSIS ###
 4645                ################
 4646
 4647                # Create analysis.json through analysis dict
 4648                # either analysis in param or by default
 4649                # depending on preset exome/genome)
 4650
 4651                # Init analysis dict
 4652                param_exomiser_analysis_dict = {}
 4653
 4654                # analysis from param
 4655                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4656                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4657
 4658                # If analysis in param -> load anlaysis json
 4659                if param_exomiser_analysis:
 4660
 4661                    # If param analysis is a file and exists
 4662                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4663                        param_exomiser_analysis
 4664                    ):
 4665                        # Load analysis file into analysis dict (either yaml or json)
 4666                        with open(param_exomiser_analysis) as json_file:
 4667                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4668
 4669                    # If param analysis is a dict
 4670                    elif isinstance(param_exomiser_analysis, dict):
 4671                        # Load analysis dict into analysis dict (either yaml or json)
 4672                        param_exomiser_analysis_dict = param_exomiser_analysis
 4673
 4674                    # Error analysis type
 4675                    else:
 4676                        log.error(f"Analysis type unknown. Check param file.")
 4677                        raise ValueError(f"Analysis type unknown. Check param file.")
 4678
 4679                # Case no input analysis config file/dict
 4680                # Use preset (exome/genome) to open default config file
 4681                if not param_exomiser_analysis_dict:
 4682
 4683                    # default preset
 4684                    default_preset = "exome"
 4685
 4686                    # Get param preset or default preset
 4687                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4688
 4689                    # Try to find if preset is a file
 4690                    if os.path.exists(param_exomiser_preset):
 4691                        # Preset file is provided in full path
 4692                        param_exomiser_analysis_default_config_file = (
 4693                            param_exomiser_preset
 4694                        )
 4695                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4696                    #     # Preset file is provided in full path
 4697                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4698                    elif os.path.exists(
 4699                        os.path.join(folder_config, param_exomiser_preset)
 4700                    ):
 4701                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4702                        param_exomiser_analysis_default_config_file = os.path.join(
 4703                            folder_config, param_exomiser_preset
 4704                        )
 4705                    else:
 4706                        # Construct preset file
 4707                        param_exomiser_analysis_default_config_file = os.path.join(
 4708                            folder_config,
 4709                            f"preset-{param_exomiser_preset}-analysis.json",
 4710                        )
 4711
 4712                    # If preset file exists
 4713                    param_exomiser_analysis_default_config_file = full_path(
 4714                        param_exomiser_analysis_default_config_file
 4715                    )
 4716                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4717                        # Load prest file into analysis dict (either yaml or json)
 4718                        with open(
 4719                            param_exomiser_analysis_default_config_file
 4720                        ) as json_file:
 4721                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4722                                json_file
 4723                            )
 4724
 4725                    # Error preset file
 4726                    else:
 4727                        log.error(
 4728                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4729                        )
 4730                        raise ValueError(
 4731                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4732                        )
 4733
 4734                # If no analysis dict created
 4735                if not param_exomiser_analysis_dict:
 4736                    log.error(f"No analysis config")
 4737                    raise ValueError(f"No analysis config")
 4738
 4739                # Log
 4740                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4741
 4742                ### PHENOPACKET ###
 4743                ###################
 4744
 4745                # If no PhenoPacket in analysis dict -> check in param
 4746                if "phenopacket" not in param_exomiser_analysis_dict:
 4747
 4748                    # If PhenoPacket in param -> load anlaysis json
 4749                    if param_exomiser.get("phenopacket", None):
 4750
 4751                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4752                        param_exomiser_phenopacket = full_path(
 4753                            param_exomiser_phenopacket
 4754                        )
 4755
 4756                        # If param phenopacket is a file and exists
 4757                        if isinstance(
 4758                            param_exomiser_phenopacket, str
 4759                        ) and os.path.exists(param_exomiser_phenopacket):
 4760                            # Load phenopacket file into analysis dict (either yaml or json)
 4761                            with open(param_exomiser_phenopacket) as json_file:
 4762                                param_exomiser_analysis_dict["phenopacket"] = (
 4763                                    yaml.safe_load(json_file)
 4764                                )
 4765
 4766                        # If param phenopacket is a dict
 4767                        elif isinstance(param_exomiser_phenopacket, dict):
 4768                            # Load phenopacket dict into analysis dict (either yaml or json)
 4769                            param_exomiser_analysis_dict["phenopacket"] = (
 4770                                param_exomiser_phenopacket
 4771                            )
 4772
 4773                        # Error phenopacket type
 4774                        else:
 4775                            log.error(f"Phenopacket type unknown. Check param file.")
 4776                            raise ValueError(
 4777                                f"Phenopacket type unknown. Check param file."
 4778                            )
 4779
 4780                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4781                if "phenopacket" not in param_exomiser_analysis_dict:
 4782
 4783                    # Init PhenoPacket
 4784                    param_exomiser_analysis_dict["phenopacket"] = {
 4785                        "id": "analysis",
 4786                        "proband": {},
 4787                    }
 4788
 4789                    ### Add subject ###
 4790
 4791                    # If subject exists
 4792                    param_exomiser_subject = param_exomiser.get("subject", {})
 4793
 4794                    # If subject not exists -> found sample ID
 4795                    if not param_exomiser_subject:
 4796
 4797                        # Found sample ID in param
 4798                        sample = param_exomiser.get("sample", None)
 4799
 4800                        # Find sample ID (first sample)
 4801                        if not sample:
 4802                            sample_list = self.get_header_sample_list()
 4803                            if len(sample_list) > 0:
 4804                                sample = sample_list[0]
 4805                            else:
 4806                                log.error(f"No sample found")
 4807                                raise ValueError(f"No sample found")
 4808
 4809                        # Create subject
 4810                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4811
 4812                    # Add to dict
 4813                    param_exomiser_analysis_dict["phenopacket"][
 4814                        "subject"
 4815                    ] = param_exomiser_subject
 4816
 4817                    ### Add "phenotypicFeatures" ###
 4818
 4819                    # If phenotypicFeatures exists
 4820                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4821                        "phenotypicFeatures", []
 4822                    )
 4823
 4824                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4825                    if not param_exomiser_phenotypicfeatures:
 4826
 4827                        # Found HPO in param
 4828                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4829
 4830                        # Split HPO if list in string format separated by comma
 4831                        if isinstance(param_exomiser_hpo, str):
 4832                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4833
 4834                        # Create HPO list
 4835                        for hpo in param_exomiser_hpo:
 4836                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4837                            param_exomiser_phenotypicfeatures.append(
 4838                                {
 4839                                    "type": {
 4840                                        "id": f"HP:{hpo_clean}",
 4841                                        "label": f"HP:{hpo_clean}",
 4842                                    }
 4843                                }
 4844                            )
 4845
 4846                    # Add to dict
 4847                    param_exomiser_analysis_dict["phenopacket"][
 4848                        "phenotypicFeatures"
 4849                    ] = param_exomiser_phenotypicfeatures
 4850
 4851                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4852                    if not param_exomiser_phenotypicfeatures:
 4853                        for step in param_exomiser_analysis_dict.get(
 4854                            "analysis", {}
 4855                        ).get("steps", []):
 4856                            if "hiPhivePrioritiser" in step:
 4857                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4858                                    "steps", []
 4859                                ).remove(step)
 4860
 4861                ### Add Input File ###
 4862
 4863                # Initial file name and htsFiles
 4864                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4865                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4866                    {
 4867                        "uri": tmp_vcf_name,
 4868                        "htsFormat": "VCF",
 4869                        "genomeAssembly": assembly,
 4870                    }
 4871                ]
 4872
 4873                ### Add metaData ###
 4874
 4875                # If metaData not in analysis dict
 4876                if "metaData" not in param_exomiser_analysis_dict:
 4877                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4878                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4879                        "createdBy": "howard",
 4880                        "phenopacketSchemaVersion": 1,
 4881                    }
 4882
 4883                ### OutputOptions ###
 4884
 4885                # Init output result folder
 4886                output_results = os.path.join(tmp_dir, "results")
 4887
 4888                # If no outputOptions in analysis dict
 4889                if "outputOptions" not in param_exomiser_analysis_dict:
 4890
 4891                    # default output formats
 4892                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4893
 4894                    # Get outputOptions in param
 4895                    output_options = param_exomiser.get("outputOptions", None)
 4896
 4897                    # If no output_options in param -> check
 4898                    if not output_options:
 4899                        output_options = {
 4900                            "outputContributingVariantsOnly": False,
 4901                            "numGenes": 0,
 4902                            "outputFormats": defaut_output_formats,
 4903                        }
 4904
 4905                    # Replace outputDirectory in output options
 4906                    output_options["outputDirectory"] = output_results
 4907                    output_options["outputFileName"] = "howard"
 4908
 4909                    # Add outputOptions in analysis dict
 4910                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4911
 4912                else:
 4913
 4914                    # Replace output_results and output format (if exists in param)
 4915                    param_exomiser_analysis_dict["outputOptions"][
 4916                        "outputDirectory"
 4917                    ] = output_results
 4918                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4919                        list(
 4920                            set(
 4921                                param_exomiser_analysis_dict.get(
 4922                                    "outputOptions", {}
 4923                                ).get("outputFormats", [])
 4924                                + ["TSV_VARIANT", "VCF"]
 4925                            )
 4926                        )
 4927                    )
 4928
 4929                # log
 4930                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4931
 4932                ### ANALYSIS FILE ###
 4933                #####################
 4934
 4935                ### Full JSON analysis config file ###
 4936
 4937                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4938                with open(exomiser_analysis, "w") as fp:
 4939                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4940
 4941                ### SPLIT analysis and sample config files
 4942
 4943                # Splitted analysis dict
 4944                param_exomiser_analysis_dict_for_split = (
 4945                    param_exomiser_analysis_dict.copy()
 4946                )
 4947
 4948                # Phenopacket JSON file
 4949                exomiser_analysis_phenopacket = os.path.join(
 4950                    tmp_dir, "analysis_phenopacket.json"
 4951                )
 4952                with open(exomiser_analysis_phenopacket, "w") as fp:
 4953                    json.dump(
 4954                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4955                        fp,
 4956                        indent=4,
 4957                    )
 4958
 4959                # Analysis JSON file without Phenopacket parameters
 4960                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4961                exomiser_analysis_analysis = os.path.join(
 4962                    tmp_dir, "analysis_analysis.json"
 4963                )
 4964                with open(exomiser_analysis_analysis, "w") as fp:
 4965                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4966
 4967                ### INITAL VCF file ###
 4968                #######################
 4969
 4970                ### Create list of samples to use and include inti initial VCF file ####
 4971
 4972                # Subject (main sample)
 4973                # Get sample ID in analysis dict
 4974                sample_subject = (
 4975                    param_exomiser_analysis_dict.get("phenopacket", {})
 4976                    .get("subject", {})
 4977                    .get("id", None)
 4978                )
 4979                sample_proband = (
 4980                    param_exomiser_analysis_dict.get("phenopacket", {})
 4981                    .get("proband", {})
 4982                    .get("subject", {})
 4983                    .get("id", None)
 4984                )
 4985                sample = []
 4986                if sample_subject:
 4987                    sample.append(sample_subject)
 4988                if sample_proband:
 4989                    sample.append(sample_proband)
 4990
 4991                # Get sample ID within Pedigree
 4992                pedigree_persons_list = (
 4993                    param_exomiser_analysis_dict.get("phenopacket", {})
 4994                    .get("pedigree", {})
 4995                    .get("persons", {})
 4996                )
 4997
 4998                # Create list with all sample ID in pedigree (if exists)
 4999                pedigree_persons = []
 5000                for person in pedigree_persons_list:
 5001                    pedigree_persons.append(person.get("individualId"))
 5002
 5003                # Concat subject sample ID and samples ID in pedigreesamples
 5004                samples = list(set(sample + pedigree_persons))
 5005
 5006                # Check if sample list is not empty
 5007                if not samples:
 5008                    log.error(f"No samples found")
 5009                    raise ValueError(f"No samples found")
 5010
 5011                # Create VCF with sample (either sample in param or first one by default)
 5012                # Export VCF file
 5013                self.export_variant_vcf(
 5014                    vcf_file=tmp_vcf_name,
 5015                    remove_info=True,
 5016                    add_samples=True,
 5017                    list_samples=samples,
 5018                    index=False,
 5019                )
 5020
 5021                ### Execute Exomiser ###
 5022                ########################
 5023
 5024                # Init command
 5025                exomiser_command = ""
 5026
 5027                # Command exomiser options
 5028                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5029
 5030                # Release
 5031                exomiser_release = param_exomiser.get("release", None)
 5032                if exomiser_release:
 5033                    # phenotype data version
 5034                    exomiser_options += (
 5035                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5036                    )
 5037                    # data version
 5038                    exomiser_options += (
 5039                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5040                    )
 5041                    # variant white list
 5042                    variant_white_list_file = (
 5043                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5044                    )
 5045                    if os.path.exists(
 5046                        os.path.join(
 5047                            databases_folders, assembly, variant_white_list_file
 5048                        )
 5049                    ):
 5050                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5051
 5052                # transcript_source
 5053                transcript_source = param_exomiser.get(
 5054                    "transcript_source", None
 5055                )  # ucsc, refseq, ensembl
 5056                if transcript_source:
 5057                    exomiser_options += (
 5058                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5059                    )
 5060
 5061                # If analysis contain proband param
 5062                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5063                    "proband", {}
 5064                ):
 5065                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5066
 5067                # If no proband (usually uniq sample)
 5068                else:
 5069                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5070
 5071                # Log
 5072                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5073
 5074                # Run command
 5075                result = subprocess.call(
 5076                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5077                )
 5078                if result:
 5079                    log.error("Exomiser command failed")
 5080                    raise ValueError("Exomiser command failed")
 5081
 5082                ### RESULTS ###
 5083                ###############
 5084
 5085                ### Annotate with TSV fields ###
 5086
 5087                # Init result tsv file
 5088                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5089
 5090                # Init result tsv file
 5091                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5092
 5093                # Parse TSV file and explode columns in INFO field
 5094                if exomiser_to_info and os.path.exists(output_results_tsv):
 5095
 5096                    # Log
 5097                    log.debug("Exomiser columns to VCF INFO field")
 5098
 5099                    # Retrieve columns and types
 5100                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5101                    output_results_tsv_df = self.get_query_to_df(query)
 5102                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5103
 5104                    # Init concat fields for update
 5105                    sql_query_update_concat_fields = []
 5106
 5107                    # Fields to avoid
 5108                    fields_to_avoid = [
 5109                        "CONTIG",
 5110                        "START",
 5111                        "END",
 5112                        "REF",
 5113                        "ALT",
 5114                        "QUAL",
 5115                        "FILTER",
 5116                        "GENOTYPE",
 5117                    ]
 5118
 5119                    # List all columns to add into header
 5120                    for header_column in output_results_tsv_columns:
 5121
 5122                        # If header column is enable
 5123                        if header_column not in fields_to_avoid:
 5124
 5125                            # Header info type
 5126                            header_info_type = "String"
 5127                            header_column_df = output_results_tsv_df[header_column]
 5128                            header_column_df_dtype = header_column_df.dtype
 5129                            if header_column_df_dtype == object:
 5130                                if (
 5131                                    pd.to_numeric(header_column_df, errors="coerce")
 5132                                    .notnull()
 5133                                    .all()
 5134                                ):
 5135                                    header_info_type = "Float"
 5136                            else:
 5137                                header_info_type = "Integer"
 5138
 5139                            # Header info
 5140                            characters_to_validate = ["-"]
 5141                            pattern = "[" + "".join(characters_to_validate) + "]"
 5142                            header_info_name = re.sub(
 5143                                pattern,
 5144                                "_",
 5145                                f"Exomiser_{header_column}".replace("#", ""),
 5146                            )
 5147                            header_info_number = "."
 5148                            header_info_description = (
 5149                                f"Exomiser {header_column} annotation"
 5150                            )
 5151                            header_info_source = "Exomiser"
 5152                            header_info_version = "unknown"
 5153                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5154                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5155                                header_info_name,
 5156                                header_info_number,
 5157                                header_info_type,
 5158                                header_info_description,
 5159                                header_info_source,
 5160                                header_info_version,
 5161                                header_info_code,
 5162                            )
 5163
 5164                            # Add field to add for update to concat fields
 5165                            sql_query_update_concat_fields.append(
 5166                                f"""
 5167                                CASE
 5168                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5169                                    THEN concat(
 5170                                        '{header_info_name}=',
 5171                                        table_parquet."{header_column}",
 5172                                        ';'
 5173                                        )
 5174
 5175                                    ELSE ''
 5176                                END
 5177                            """
 5178                            )
 5179
 5180                    # Update query
 5181                    sql_query_update = f"""
 5182                        UPDATE {table_variants} as table_variants
 5183                            SET INFO = concat(
 5184                                            CASE
 5185                                                WHEN INFO NOT IN ('', '.')
 5186                                                THEN INFO
 5187                                                ELSE ''
 5188                                            END,
 5189                                            CASE
 5190                                                WHEN table_variants.INFO NOT IN ('','.')
 5191                                                THEN ';'
 5192                                                ELSE ''
 5193                                            END,
 5194                                            (
 5195                                            SELECT 
 5196                                                concat(
 5197                                                    {",".join(sql_query_update_concat_fields)}
 5198                                                )
 5199                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5200                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5201                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5202                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5203                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5204                                            )
 5205                                        )
 5206                            ;
 5207                        """
 5208
 5209                    # Update
 5210                    self.conn.execute(sql_query_update)
 5211
 5212                ### Annotate with VCF INFO field ###
 5213
 5214                # Init result VCF file
 5215                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5216
 5217                # If VCF exists
 5218                if os.path.exists(output_results_vcf):
 5219
 5220                    # Log
 5221                    log.debug("Exomiser result VCF update variants")
 5222
 5223                    # Find Exomiser INFO field annotation in header
 5224                    with gzip.open(output_results_vcf, "rt") as f:
 5225                        header_list = self.read_vcf_header(f)
 5226                    exomiser_vcf_header = vcf.Reader(
 5227                        io.StringIO("\n".join(header_list))
 5228                    )
 5229
 5230                    # Add annotation INFO field to header
 5231                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5232
 5233                    # Update variants with VCF
 5234                    self.update_from_vcf(output_results_vcf)
 5235
 5236        return True
 5237
 5238    def annotation_snpeff(self, threads: int = None) -> None:
 5239        """
 5240        This function annotate with snpEff
 5241
 5242        :param threads: The number of threads to use
 5243        :return: the value of the variable "return_value".
 5244        """
 5245
 5246        # DEBUG
 5247        log.debug("Start annotation with snpeff databases")
 5248
 5249        # Threads
 5250        if not threads:
 5251            threads = self.get_threads()
 5252        log.debug("Threads: " + str(threads))
 5253
 5254        # DEBUG
 5255        delete_tmp = True
 5256        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5257            delete_tmp = False
 5258            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5259
 5260        # Config
 5261        config = self.get_config()
 5262        log.debug("Config: " + str(config))
 5263
 5264        # Config - Folders - Databases
 5265        databases_folders = (
 5266            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5267        )
 5268        log.debug("Databases annotations: " + str(databases_folders))
 5269
 5270        # Config - snpEff bin command
 5271        snpeff_bin_command = get_bin_command(
 5272            bin="snpEff.jar",
 5273            tool="snpeff",
 5274            bin_type="jar",
 5275            config=config,
 5276            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5277        )
 5278        if not snpeff_bin_command:
 5279            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5280            log.error(msg_err)
 5281            raise ValueError(msg_err)
 5282
 5283        # Config - snpEff databases
 5284        snpeff_databases = (
 5285            config.get("folders", {})
 5286            .get("databases", {})
 5287            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5288        )
 5289        snpeff_databases = full_path(snpeff_databases)
 5290        if snpeff_databases is not None and snpeff_databases != "":
 5291            log.debug(f"Create snpEff databases folder")
 5292            if not os.path.exists(snpeff_databases):
 5293                os.makedirs(snpeff_databases)
 5294
 5295        # Param
 5296        param = self.get_param()
 5297        log.debug("Param: " + str(param))
 5298
 5299        # Param
 5300        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5301        log.debug("Options: " + str(options))
 5302
 5303        # Param - Assembly
 5304        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5305
 5306        # Param - Options
 5307        snpeff_options = (
 5308            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5309        )
 5310        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5311        snpeff_csvstats = (
 5312            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5313        )
 5314        if snpeff_stats:
 5315            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5316            snpeff_stats = full_path(snpeff_stats)
 5317            snpeff_options += f" -stats {snpeff_stats}"
 5318        if snpeff_csvstats:
 5319            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5320            snpeff_csvstats = full_path(snpeff_csvstats)
 5321            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5322
 5323        # Data
 5324        table_variants = self.get_table_variants()
 5325
 5326        # Check if not empty
 5327        log.debug("Check if not empty")
 5328        sql_query_chromosomes = (
 5329            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5330        )
 5331        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5332        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5333            log.info(f"VCF empty")
 5334            return
 5335
 5336        # Export in VCF
 5337        log.debug("Create initial file to annotate")
 5338        tmp_vcf = NamedTemporaryFile(
 5339            prefix=self.get_prefix(),
 5340            dir=self.get_tmp_dir(),
 5341            suffix=".vcf.gz",
 5342            delete=True,
 5343        )
 5344        tmp_vcf_name = tmp_vcf.name
 5345
 5346        # VCF header
 5347        vcf_reader = self.get_header()
 5348        log.debug("Initial header: " + str(vcf_reader.infos))
 5349
 5350        # Existing annotations
 5351        for vcf_annotation in self.get_header().infos:
 5352
 5353            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5354            log.debug(
 5355                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5356            )
 5357
 5358        # Memory limit
 5359        # if config.get("memory", None):
 5360        #     memory_limit = config.get("memory", "8G")
 5361        # else:
 5362        #     memory_limit = "8G"
 5363        memory_limit = self.get_memory("8G")
 5364        log.debug(f"memory_limit: {memory_limit}")
 5365
 5366        # snpEff java options
 5367        snpeff_java_options = (
 5368            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5369        )
 5370        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5371
 5372        force_update_annotation = True
 5373
 5374        if "ANN" not in self.get_header().infos or force_update_annotation:
 5375
 5376            # Check snpEff database
 5377            log.debug(f"Check snpEff databases {[assembly]}")
 5378            databases_download_snpeff(
 5379                folder=snpeff_databases, assemblies=[assembly], config=config
 5380            )
 5381
 5382            # Export VCF file
 5383            self.export_variant_vcf(
 5384                vcf_file=tmp_vcf_name,
 5385                remove_info=True,
 5386                add_samples=False,
 5387                index=True,
 5388            )
 5389
 5390            # Tmp file
 5391            err_files = []
 5392            tmp_annotate_vcf = NamedTemporaryFile(
 5393                prefix=self.get_prefix(),
 5394                dir=self.get_tmp_dir(),
 5395                suffix=".vcf",
 5396                delete=False,
 5397            )
 5398            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5399            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5400            err_files.append(tmp_annotate_vcf_name_err)
 5401
 5402            # Command
 5403            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5404            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5405            run_parallel_commands([snpeff_command], 1)
 5406
 5407            # Error messages
 5408            log.info(f"Error/Warning messages:")
 5409            error_message_command_all = []
 5410            error_message_command_warning = []
 5411            error_message_command_err = []
 5412            for err_file in err_files:
 5413                with open(err_file, "r") as f:
 5414                    for line in f:
 5415                        message = line.strip()
 5416                        error_message_command_all.append(message)
 5417                        if line.startswith("[W::"):
 5418                            error_message_command_warning.append(message)
 5419                        if line.startswith("[E::"):
 5420                            error_message_command_err.append(f"{err_file}: " + message)
 5421            # log info
 5422            for message in list(
 5423                set(error_message_command_err + error_message_command_warning)
 5424            ):
 5425                log.info(f"   {message}")
 5426            # debug info
 5427            for message in list(set(error_message_command_all)):
 5428                log.debug(f"   {message}")
 5429            # failed
 5430            if len(error_message_command_err):
 5431                log.error("Annotation failed: Error in commands")
 5432                raise ValueError("Annotation failed: Error in commands")
 5433
 5434            # Find annotation in header
 5435            with open(tmp_annotate_vcf_name, "rt") as f:
 5436                header_list = self.read_vcf_header(f)
 5437            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5438
 5439            for ann in annovar_vcf_header.infos:
 5440                if ann not in self.get_header().infos:
 5441                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5442
 5443            # Update variants
 5444            log.info(f"Annotation - Updating...")
 5445            self.update_from_vcf(tmp_annotate_vcf_name)
 5446
 5447        else:
 5448            if "ANN" in self.get_header().infos:
 5449                log.debug(f"Existing snpEff annotations in VCF")
 5450            if force_update_annotation:
 5451                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5452
 5453    def annotation_annovar(self, threads: int = None) -> None:
 5454        """
 5455        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5456        annotations
 5457
 5458        :param threads: number of threads to use
 5459        :return: the value of the variable "return_value".
 5460        """
 5461
 5462        # DEBUG
 5463        log.debug("Start annotation with Annovar databases")
 5464
 5465        # Threads
 5466        if not threads:
 5467            threads = self.get_threads()
 5468        log.debug("Threads: " + str(threads))
 5469
 5470        # Tmp en Err files
 5471        tmp_files = []
 5472        err_files = []
 5473
 5474        # DEBUG
 5475        delete_tmp = True
 5476        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5477            delete_tmp = False
 5478            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5479
 5480        # Config
 5481        config = self.get_config()
 5482        log.debug("Config: " + str(config))
 5483
 5484        # Config - Folders - Databases
 5485        databases_folders = (
 5486            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5487        )
 5488        log.debug("Databases annotations: " + str(databases_folders))
 5489
 5490        # Config - annovar bin command
 5491        annovar_bin_command = get_bin_command(
 5492            bin="table_annovar.pl",
 5493            tool="annovar",
 5494            bin_type="perl",
 5495            config=config,
 5496            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5497        )
 5498        if not annovar_bin_command:
 5499            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5500            log.error(msg_err)
 5501            raise ValueError(msg_err)
 5502
 5503        # Config - BCFTools bin command
 5504        bcftools_bin_command = get_bin_command(
 5505            bin="bcftools",
 5506            tool="bcftools",
 5507            bin_type="bin",
 5508            config=config,
 5509            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5510        )
 5511        if not bcftools_bin_command:
 5512            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5513            log.error(msg_err)
 5514            raise ValueError(msg_err)
 5515
 5516        # Config - annovar databases
 5517        annovar_databases = (
 5518            config.get("folders", {})
 5519            .get("databases", {})
 5520            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5521        )
 5522        if annovar_databases is not None:
 5523            if isinstance(annovar_databases, list):
 5524                annovar_databases = full_path(annovar_databases[0])
 5525                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5526            annovar_databases = full_path(annovar_databases)
 5527            if not os.path.exists(annovar_databases):
 5528                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5529                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5530        else:
 5531            msg_err = f"Annovar databases configuration failed"
 5532            log.error(msg_err)
 5533            raise ValueError(msg_err)
 5534
 5535        # Param
 5536        param = self.get_param()
 5537        log.debug("Param: " + str(param))
 5538
 5539        # Param - options
 5540        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5541        log.debug("Options: " + str(options))
 5542
 5543        # Param - annotations
 5544        annotations = (
 5545            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5546        )
 5547        log.debug("Annotations: " + str(annotations))
 5548
 5549        # Param - Assembly
 5550        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5551
 5552        # Annovar database assembly
 5553        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5554        if annovar_databases_assembly != "" and not os.path.exists(
 5555            annovar_databases_assembly
 5556        ):
 5557            os.makedirs(annovar_databases_assembly)
 5558
 5559        # Data
 5560        table_variants = self.get_table_variants()
 5561
 5562        # Check if not empty
 5563        log.debug("Check if not empty")
 5564        sql_query_chromosomes = (
 5565            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5566        )
 5567        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5568        if not sql_query_chromosomes_df["count"][0]:
 5569            log.info(f"VCF empty")
 5570            return
 5571
 5572        # VCF header
 5573        vcf_reader = self.get_header()
 5574        log.debug("Initial header: " + str(vcf_reader.infos))
 5575
 5576        # Existing annotations
 5577        for vcf_annotation in self.get_header().infos:
 5578
 5579            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5580            log.debug(
 5581                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5582            )
 5583
 5584        force_update_annotation = True
 5585
 5586        if annotations:
 5587
 5588            commands = []
 5589            tmp_annotates_vcf_name_list = []
 5590
 5591            # Export in VCF
 5592            log.debug("Create initial file to annotate")
 5593            tmp_vcf = NamedTemporaryFile(
 5594                prefix=self.get_prefix(),
 5595                dir=self.get_tmp_dir(),
 5596                suffix=".vcf.gz",
 5597                delete=False,
 5598            )
 5599            tmp_vcf_name = tmp_vcf.name
 5600            tmp_files.append(tmp_vcf_name)
 5601            tmp_files.append(tmp_vcf_name + ".tbi")
 5602
 5603            # Export VCF file
 5604            self.export_variant_vcf(
 5605                vcf_file=tmp_vcf_name,
 5606                remove_info=".",
 5607                add_samples=False,
 5608                index=True,
 5609            )
 5610
 5611            # Create file for field rename
 5612            log.debug("Create file for field rename")
 5613            tmp_rename = NamedTemporaryFile(
 5614                prefix=self.get_prefix(),
 5615                dir=self.get_tmp_dir(),
 5616                suffix=".rename",
 5617                delete=False,
 5618            )
 5619            tmp_rename_name = tmp_rename.name
 5620            tmp_files.append(tmp_rename_name)
 5621
 5622            # Check Annovar database
 5623            log.debug(
 5624                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5625            )
 5626            databases_download_annovar(
 5627                folder=annovar_databases,
 5628                files=list(annotations.keys()),
 5629                assemblies=[assembly],
 5630            )
 5631
 5632            for annotation in annotations:
 5633                annotation_fields = annotations[annotation]
 5634
 5635                if not annotation_fields:
 5636                    annotation_fields = {"INFO": None}
 5637
 5638                log.info(f"Annotations Annovar - database '{annotation}'")
 5639                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5640
 5641                # Tmp file for annovar
 5642                err_files = []
 5643                tmp_annotate_vcf_directory = TemporaryDirectory(
 5644                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5645                )
 5646                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5647                tmp_annotate_vcf_name_annovar = (
 5648                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5649                )
 5650                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5651                err_files.append(tmp_annotate_vcf_name_err)
 5652                tmp_files.append(tmp_annotate_vcf_name_err)
 5653
 5654                # Tmp file final vcf annotated by annovar
 5655                tmp_annotate_vcf = NamedTemporaryFile(
 5656                    prefix=self.get_prefix(),
 5657                    dir=self.get_tmp_dir(),
 5658                    suffix=".vcf.gz",
 5659                    delete=False,
 5660                )
 5661                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5662                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5663                tmp_files.append(tmp_annotate_vcf_name)
 5664                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5665
 5666                # Number of fields
 5667                annotation_list = []
 5668                annotation_renamed_list = []
 5669
 5670                for annotation_field in annotation_fields:
 5671
 5672                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5673                    annotation_fields_new_name = annotation_fields.get(
 5674                        annotation_field, annotation_field
 5675                    )
 5676                    if not annotation_fields_new_name:
 5677                        annotation_fields_new_name = annotation_field
 5678
 5679                    if (
 5680                        force_update_annotation
 5681                        or annotation_fields_new_name not in self.get_header().infos
 5682                    ):
 5683                        annotation_list.append(annotation_field)
 5684                        annotation_renamed_list.append(annotation_fields_new_name)
 5685                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5686                        log.warning(
 5687                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5688                        )
 5689
 5690                    # Add rename info
 5691                    run_parallel_commands(
 5692                        [
 5693                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5694                        ],
 5695                        1,
 5696                    )
 5697
 5698                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5699                log.debug("annotation_list: " + str(annotation_list))
 5700
 5701                # protocol
 5702                protocol = annotation
 5703
 5704                # argument
 5705                argument = ""
 5706
 5707                # operation
 5708                operation = "f"
 5709                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5710                    "ensGene"
 5711                ):
 5712                    operation = "g"
 5713                    if options.get("genebase", None):
 5714                        argument = f"""'{options.get("genebase","")}'"""
 5715                elif annotation in ["cytoBand"]:
 5716                    operation = "r"
 5717
 5718                # argument option
 5719                argument_option = ""
 5720                if argument != "":
 5721                    argument_option = " --argument " + argument
 5722
 5723                # command options
 5724                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5725                for option in options:
 5726                    if option not in ["genebase"]:
 5727                        command_options += f""" --{option}={options[option]}"""
 5728
 5729                # Command
 5730
 5731                # Command - Annovar
 5732                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5733                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5734
 5735                # Command - start pipe
 5736                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5737
 5738                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5739                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5740
 5741                # Command - Special characters (refGene annotation)
 5742                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5743
 5744                # Command - Clean empty fields (with value ".")
 5745                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5746
 5747                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5748                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5749                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5750                    # for ann in annotation_renamed_list:
 5751                    for ann in annotation_list:
 5752                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5753
 5754                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5755
 5756                # Command - indexing
 5757                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5758
 5759                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5760                run_parallel_commands([command_annovar], 1)
 5761
 5762                # Error messages
 5763                log.info(f"Error/Warning messages:")
 5764                error_message_command_all = []
 5765                error_message_command_warning = []
 5766                error_message_command_err = []
 5767                for err_file in err_files:
 5768                    with open(err_file, "r") as f:
 5769                        for line in f:
 5770                            message = line.strip()
 5771                            error_message_command_all.append(message)
 5772                            if line.startswith("[W::") or line.startswith("WARNING"):
 5773                                error_message_command_warning.append(message)
 5774                            if line.startswith("[E::") or line.startswith("ERROR"):
 5775                                error_message_command_err.append(
 5776                                    f"{err_file}: " + message
 5777                                )
 5778                # log info
 5779                for message in list(
 5780                    set(error_message_command_err + error_message_command_warning)
 5781                ):
 5782                    log.info(f"   {message}")
 5783                # debug info
 5784                for message in list(set(error_message_command_all)):
 5785                    log.debug(f"   {message}")
 5786                # failed
 5787                if len(error_message_command_err):
 5788                    log.error("Annotation failed: Error in commands")
 5789                    raise ValueError("Annotation failed: Error in commands")
 5790
 5791            if tmp_annotates_vcf_name_list:
 5792
 5793                # List of annotated files
 5794                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5795
 5796                # Tmp file
 5797                tmp_annotate_vcf = NamedTemporaryFile(
 5798                    prefix=self.get_prefix(),
 5799                    dir=self.get_tmp_dir(),
 5800                    suffix=".vcf.gz",
 5801                    delete=False,
 5802                )
 5803                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5804                tmp_files.append(tmp_annotate_vcf_name)
 5805                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5806                err_files.append(tmp_annotate_vcf_name_err)
 5807                tmp_files.append(tmp_annotate_vcf_name_err)
 5808
 5809                # Command merge
 5810                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5811                log.info(
 5812                    f"Annotation Annovar - Annotation merging "
 5813                    + str(len(tmp_annotates_vcf_name_list))
 5814                    + " annotated files"
 5815                )
 5816                log.debug(f"Annotation - merge command: {merge_command}")
 5817                run_parallel_commands([merge_command], 1)
 5818
 5819                # Find annotation in header
 5820                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5821                    header_list = self.read_vcf_header(f)
 5822                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5823
 5824                for ann in annovar_vcf_header.infos:
 5825                    if ann not in self.get_header().infos:
 5826                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5827
 5828                # Update variants
 5829                log.info(f"Annotation Annovar - Updating...")
 5830                self.update_from_vcf(tmp_annotate_vcf_name)
 5831
 5832            # Clean files
 5833            # Tmp file remove command
 5834            if True:
 5835                tmp_files_remove_command = ""
 5836                if tmp_files:
 5837                    tmp_files_remove_command = " ".join(tmp_files)
 5838                clean_command = f" rm -f {tmp_files_remove_command} "
 5839                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5840                log.debug(f"Annotation - cleaning command: {clean_command}")
 5841                run_parallel_commands([clean_command], 1)
 5842
 5843    # Parquet
 5844    def annotation_parquet(self, threads: int = None) -> None:
 5845        """
 5846        It takes a VCF file, and annotates it with a parquet file
 5847
 5848        :param threads: number of threads to use for the annotation
 5849        :return: the value of the variable "result".
 5850        """
 5851
 5852        # DEBUG
 5853        log.debug("Start annotation with parquet databases")
 5854
 5855        # Threads
 5856        if not threads:
 5857            threads = self.get_threads()
 5858        log.debug("Threads: " + str(threads))
 5859
 5860        # DEBUG
 5861        delete_tmp = True
 5862        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5863            delete_tmp = False
 5864            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5865
 5866        # Config
 5867        databases_folders = set(
 5868            self.get_config()
 5869            .get("folders", {})
 5870            .get("databases", {})
 5871            .get("annotations", ["."])
 5872            + self.get_config()
 5873            .get("folders", {})
 5874            .get("databases", {})
 5875            .get("parquet", ["."])
 5876        )
 5877        log.debug("Databases annotations: " + str(databases_folders))
 5878
 5879        # Param
 5880        annotations = (
 5881            self.get_param()
 5882            .get("annotation", {})
 5883            .get("parquet", {})
 5884            .get("annotations", None)
 5885        )
 5886        log.debug("Annotations: " + str(annotations))
 5887
 5888        # Assembly
 5889        assembly = self.get_param().get(
 5890            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5891        )
 5892
 5893        # Force Update Annotation
 5894        force_update_annotation = (
 5895            self.get_param()
 5896            .get("annotation", {})
 5897            .get("options", {})
 5898            .get("annotations_update", False)
 5899        )
 5900        log.debug(f"force_update_annotation={force_update_annotation}")
 5901        force_append_annotation = (
 5902            self.get_param()
 5903            .get("annotation", {})
 5904            .get("options", {})
 5905            .get("annotations_append", False)
 5906        )
 5907        log.debug(f"force_append_annotation={force_append_annotation}")
 5908
 5909        # Data
 5910        table_variants = self.get_table_variants()
 5911
 5912        # Check if not empty
 5913        log.debug("Check if not empty")
 5914        sql_query_chromosomes_df = self.get_query_to_df(
 5915            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5916        )
 5917        if not sql_query_chromosomes_df["count"][0]:
 5918            log.info(f"VCF empty")
 5919            return
 5920
 5921        # VCF header
 5922        vcf_reader = self.get_header()
 5923        log.debug("Initial header: " + str(vcf_reader.infos))
 5924
 5925        # Nb Variants POS
 5926        log.debug("NB Variants Start")
 5927        nb_variants = self.conn.execute(
 5928            f"SELECT count(*) AS count FROM variants"
 5929        ).fetchdf()["count"][0]
 5930        log.debug("NB Variants Stop")
 5931
 5932        # Existing annotations
 5933        for vcf_annotation in self.get_header().infos:
 5934
 5935            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5936            log.debug(
 5937                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5938            )
 5939
 5940        # Added columns
 5941        added_columns = []
 5942
 5943        # drop indexes
 5944        log.debug(f"Drop indexes...")
 5945        self.drop_indexes()
 5946
 5947        if annotations:
 5948
 5949            if "ALL" in annotations:
 5950
 5951                all_param = annotations.get("ALL", {})
 5952                all_param_formats = all_param.get("formats", None)
 5953                all_param_releases = all_param.get("releases", None)
 5954
 5955                databases_infos_dict = self.scan_databases(
 5956                    database_formats=all_param_formats,
 5957                    database_releases=all_param_releases,
 5958                )
 5959                for database_infos in databases_infos_dict.keys():
 5960                    if database_infos not in annotations:
 5961                        annotations[database_infos] = {"INFO": None}
 5962
 5963            for annotation in annotations:
 5964
 5965                if annotation in ["ALL"]:
 5966                    continue
 5967
 5968                # Annotation Name
 5969                annotation_name = os.path.basename(annotation)
 5970
 5971                # Annotation fields
 5972                annotation_fields = annotations[annotation]
 5973                if not annotation_fields:
 5974                    annotation_fields = {"INFO": None}
 5975
 5976                log.debug(f"Annotation '{annotation_name}'")
 5977                log.debug(
 5978                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5979                )
 5980
 5981                # Create Database
 5982                database = Database(
 5983                    database=annotation,
 5984                    databases_folders=databases_folders,
 5985                    assembly=assembly,
 5986                )
 5987
 5988                # Find files
 5989                parquet_file = database.get_database()
 5990                parquet_hdr_file = database.get_header_file()
 5991                parquet_type = database.get_type()
 5992
 5993                # Check if files exists
 5994                if not parquet_file or not parquet_hdr_file:
 5995                    msg_err_list = []
 5996                    if not parquet_file:
 5997                        msg_err_list.append(
 5998                            f"Annotation failed: Annotation file not found"
 5999                        )
 6000                    if parquet_file and not parquet_hdr_file:
 6001                        msg_err_list.append(
 6002                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 6003                        )
 6004
 6005                    log.error(". ".join(msg_err_list))
 6006                    raise ValueError(". ".join(msg_err_list))
 6007                else:
 6008                    # Get parquet connexion
 6009                    parquet_sql_attach = database.get_sql_database_attach(
 6010                        output="query"
 6011                    )
 6012                    if parquet_sql_attach:
 6013                        self.conn.execute(parquet_sql_attach)
 6014                    parquet_file_link = database.get_sql_database_link()
 6015                    # Log
 6016                    log.debug(
 6017                        f"Annotation '{annotation_name}' - file: "
 6018                        + str(parquet_file)
 6019                        + " and "
 6020                        + str(parquet_hdr_file)
 6021                    )
 6022
 6023                    # Database full header columns
 6024                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6025                        parquet_hdr_file
 6026                    )
 6027                    # Log
 6028                    log.debug(
 6029                        "Annotation database header columns : "
 6030                        + str(parquet_hdr_vcf_header_columns)
 6031                    )
 6032
 6033                    # Load header as VCF object
 6034                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6035                    # Log
 6036                    log.debug(
 6037                        "Annotation database header: "
 6038                        + str(parquet_hdr_vcf_header_infos)
 6039                    )
 6040
 6041                    # Get extra infos
 6042                    parquet_columns = database.get_extra_columns()
 6043                    # Log
 6044                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6045
 6046                    # Add extra columns if "ALL" in annotation_fields
 6047                    # if "ALL" in annotation_fields:
 6048                    #     allow_add_extra_column = True
 6049                    if "ALL" in annotation_fields and database.get_extra_columns():
 6050                        for extra_column in database.get_extra_columns():
 6051                            if (
 6052                                extra_column not in annotation_fields
 6053                                and extra_column.replace("INFO/", "")
 6054                                not in parquet_hdr_vcf_header_infos
 6055                            ):
 6056                                parquet_hdr_vcf_header_infos[extra_column] = (
 6057                                    vcf.parser._Info(
 6058                                        extra_column,
 6059                                        ".",
 6060                                        "String",
 6061                                        f"{extra_column} description",
 6062                                        "unknown",
 6063                                        "unknown",
 6064                                        self.code_type_map["String"],
 6065                                    )
 6066                                )
 6067
 6068                    # For all fields in database
 6069                    annotation_fields_all = False
 6070                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6071                        annotation_fields_all = True
 6072                        annotation_fields = {
 6073                            key: key for key in parquet_hdr_vcf_header_infos
 6074                        }
 6075
 6076                        log.debug(
 6077                            "Annotation database header - All annotations added: "
 6078                            + str(annotation_fields)
 6079                        )
 6080
 6081                    # Init
 6082
 6083                    # List of annotation fields to use
 6084                    sql_query_annotation_update_info_sets = []
 6085
 6086                    # List of annotation to agregate
 6087                    sql_query_annotation_to_agregate = []
 6088
 6089                    # Number of fields
 6090                    nb_annotation_field = 0
 6091
 6092                    # Annotation fields processed
 6093                    annotation_fields_processed = []
 6094
 6095                    # Columns mapping
 6096                    map_columns = database.map_columns(
 6097                        columns=annotation_fields, prefixes=["INFO/"]
 6098                    )
 6099
 6100                    # Query dict for fields to remove (update option)
 6101                    query_dict_remove = {}
 6102
 6103                    # Fetch Anotation fields
 6104                    for annotation_field in annotation_fields:
 6105
 6106                        # annotation_field_column
 6107                        annotation_field_column = map_columns.get(
 6108                            annotation_field, "INFO"
 6109                        )
 6110
 6111                        # field new name, if parametered
 6112                        annotation_fields_new_name = annotation_fields.get(
 6113                            annotation_field, annotation_field
 6114                        )
 6115                        if not annotation_fields_new_name:
 6116                            annotation_fields_new_name = annotation_field
 6117
 6118                        # To annotate
 6119                        # force_update_annotation = True
 6120                        # force_append_annotation = True
 6121                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6122                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6123                            force_update_annotation
 6124                            or force_append_annotation
 6125                            or (
 6126                                annotation_fields_new_name
 6127                                not in self.get_header().infos
 6128                            )
 6129                        ):
 6130
 6131                            # Add field to annotation to process list
 6132                            annotation_fields_processed.append(
 6133                                annotation_fields_new_name
 6134                            )
 6135
 6136                            # explode infos for the field
 6137                            annotation_fields_new_name_info_msg = ""
 6138                            if (
 6139                                force_update_annotation
 6140                                and annotation_fields_new_name
 6141                                in self.get_header().infos
 6142                            ):
 6143                                # Remove field from INFO
 6144                                query = f"""
 6145                                    UPDATE {table_variants} as table_variants
 6146                                    SET INFO = REGEXP_REPLACE(
 6147                                                concat(table_variants.INFO,''),
 6148                                                ';*{annotation_fields_new_name}=[^;]*',
 6149                                                ''
 6150                                                )
 6151                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6152                                """
 6153                                annotation_fields_new_name_info_msg = " [update]"
 6154                                query_dict_remove[
 6155                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6156                                ] = query
 6157
 6158                            # Sep between fields in INFO
 6159                            nb_annotation_field += 1
 6160                            if nb_annotation_field > 1:
 6161                                annotation_field_sep = ";"
 6162                            else:
 6163                                annotation_field_sep = ""
 6164
 6165                            log.info(
 6166                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6167                            )
 6168
 6169                            # Add INFO field to header
 6170                            parquet_hdr_vcf_header_infos_number = (
 6171                                parquet_hdr_vcf_header_infos[annotation_field].num
 6172                                or "."
 6173                            )
 6174                            parquet_hdr_vcf_header_infos_type = (
 6175                                parquet_hdr_vcf_header_infos[annotation_field].type
 6176                                or "String"
 6177                            )
 6178                            parquet_hdr_vcf_header_infos_description = (
 6179                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6180                                or f"{annotation_field} description"
 6181                            )
 6182                            parquet_hdr_vcf_header_infos_source = (
 6183                                parquet_hdr_vcf_header_infos[annotation_field].source
 6184                                or "unknown"
 6185                            )
 6186                            parquet_hdr_vcf_header_infos_version = (
 6187                                parquet_hdr_vcf_header_infos[annotation_field].version
 6188                                or "unknown"
 6189                            )
 6190
 6191                            vcf_reader.infos[annotation_fields_new_name] = (
 6192                                vcf.parser._Info(
 6193                                    annotation_fields_new_name,
 6194                                    parquet_hdr_vcf_header_infos_number,
 6195                                    parquet_hdr_vcf_header_infos_type,
 6196                                    parquet_hdr_vcf_header_infos_description,
 6197                                    parquet_hdr_vcf_header_infos_source,
 6198                                    parquet_hdr_vcf_header_infos_version,
 6199                                    self.code_type_map[
 6200                                        parquet_hdr_vcf_header_infos_type
 6201                                    ],
 6202                                )
 6203                            )
 6204
 6205                            # Append
 6206                            if force_append_annotation:
 6207                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6208                            else:
 6209                                query_case_when_append = ""
 6210
 6211                            # Annotation/Update query fields
 6212                            # Found in INFO column
 6213                            if (
 6214                                annotation_field_column == "INFO"
 6215                                and "INFO" in parquet_hdr_vcf_header_columns
 6216                            ):
 6217                                sql_query_annotation_update_info_sets.append(
 6218                                    f"""
 6219                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6220                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6221                                        ELSE ''
 6222                                    END
 6223                                """
 6224                                )
 6225                            # Found in a specific column
 6226                            else:
 6227                                sql_query_annotation_update_info_sets.append(
 6228                                    f"""
 6229                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6230                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6231                                        ELSE ''
 6232                                    END
 6233                                """
 6234                                )
 6235                                sql_query_annotation_to_agregate.append(
 6236                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6237                                )
 6238
 6239                        # Not to annotate
 6240                        else:
 6241
 6242                            if force_update_annotation:
 6243                                annotation_message = "forced"
 6244                            else:
 6245                                annotation_message = "skipped"
 6246
 6247                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6248                                log.warning(
 6249                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6250                                )
 6251                            if annotation_fields_new_name in self.get_header().infos:
 6252                                log.warning(
 6253                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6254                                )
 6255
 6256                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6257                    # allow_annotation_full_info = True
 6258                    allow_annotation_full_info = not force_append_annotation
 6259
 6260                    if parquet_type in ["regions"]:
 6261                        allow_annotation_full_info = False
 6262
 6263                    if (
 6264                        allow_annotation_full_info
 6265                        and nb_annotation_field == len(annotation_fields)
 6266                        and annotation_fields_all
 6267                        and (
 6268                            "INFO" in parquet_hdr_vcf_header_columns
 6269                            and "INFO" in database.get_extra_columns()
 6270                        )
 6271                    ):
 6272                        log.debug("Column INFO annotation enabled")
 6273                        sql_query_annotation_update_info_sets = []
 6274                        sql_query_annotation_update_info_sets.append(
 6275                            f" table_parquet.INFO "
 6276                        )
 6277
 6278                    if sql_query_annotation_update_info_sets:
 6279
 6280                        # Annotate
 6281                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6282
 6283                        # Join query annotation update info sets for SQL
 6284                        sql_query_annotation_update_info_sets_sql = ",".join(
 6285                            sql_query_annotation_update_info_sets
 6286                        )
 6287
 6288                        # Check chromosomes list (and variants infos)
 6289                        sql_query_chromosomes = f"""
 6290                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6291                            FROM {table_variants} as table_variants
 6292                            GROUP BY table_variants."#CHROM"
 6293                            ORDER BY table_variants."#CHROM"
 6294                            """
 6295                        sql_query_chromosomes_df = self.conn.execute(
 6296                            sql_query_chromosomes
 6297                        ).df()
 6298                        sql_query_chromosomes_dict = {
 6299                            entry["CHROM"]: {
 6300                                "count": entry["count_variants"],
 6301                                "min": entry["min_variants"],
 6302                                "max": entry["max_variants"],
 6303                            }
 6304                            for index, entry in sql_query_chromosomes_df.iterrows()
 6305                        }
 6306
 6307                        # Init
 6308                        nb_of_query = 0
 6309                        nb_of_variant_annotated = 0
 6310                        query_dict = query_dict_remove
 6311
 6312                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6313                        for chrom in sql_query_chromosomes_dict:
 6314
 6315                            # Number of variant by chromosome
 6316                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6317                                chrom, {}
 6318                            ).get("count", 0)
 6319
 6320                            log.debug(
 6321                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6322                            )
 6323
 6324                            # Annotation with regions database
 6325                            if parquet_type in ["regions"]:
 6326                                sql_query_annotation_from_clause = f"""
 6327                                    FROM (
 6328                                        SELECT 
 6329                                            '{chrom}' AS \"#CHROM\",
 6330                                            table_variants_from.\"POS\" AS \"POS\",
 6331                                            {",".join(sql_query_annotation_to_agregate)}
 6332                                        FROM {table_variants} as table_variants_from
 6333                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6334                                            table_parquet_from."#CHROM" = '{chrom}'
 6335                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6336                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6337                                        )
 6338                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6339                                        GROUP BY table_variants_from.\"POS\"
 6340                                        )
 6341                                        as table_parquet
 6342                                """
 6343
 6344                                sql_query_annotation_where_clause = """
 6345                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6346                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6347                                """
 6348
 6349                            # Annotation with variants database
 6350                            else:
 6351                                sql_query_annotation_from_clause = f"""
 6352                                    FROM {parquet_file_link} as table_parquet
 6353                                """
 6354                                sql_query_annotation_where_clause = f"""
 6355                                    table_variants."#CHROM" = '{chrom}'
 6356                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6357                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6358                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6359                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6360                                """
 6361
 6362                            # Create update query
 6363                            sql_query_annotation_chrom_interval_pos = f"""
 6364                                UPDATE {table_variants} as table_variants
 6365                                    SET INFO = 
 6366                                        concat(
 6367                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6368                                                THEN table_variants.INFO
 6369                                                ELSE ''
 6370                                            END
 6371                                            ,
 6372                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6373                                                        AND (
 6374                                                        concat({sql_query_annotation_update_info_sets_sql})
 6375                                                        )
 6376                                                        NOT IN ('','.') 
 6377                                                    THEN ';'
 6378                                                    ELSE ''
 6379                                            END
 6380                                            ,
 6381                                            {sql_query_annotation_update_info_sets_sql}
 6382                                            )
 6383                                    {sql_query_annotation_from_clause}
 6384                                    WHERE {sql_query_annotation_where_clause}
 6385                                    ;
 6386                                """
 6387
 6388                            # Add update query to dict
 6389                            query_dict[
 6390                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6391                            ] = sql_query_annotation_chrom_interval_pos
 6392
 6393                        nb_of_query = len(query_dict)
 6394                        num_query = 0
 6395
 6396                        # SET max_expression_depth TO x
 6397                        self.conn.execute("SET max_expression_depth TO 10000")
 6398
 6399                        for query_name in query_dict:
 6400                            query = query_dict[query_name]
 6401                            num_query += 1
 6402                            log.info(
 6403                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6404                            )
 6405                            result = self.conn.execute(query)
 6406                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6407                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6408                            log.info(
 6409                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6410                            )
 6411
 6412                        log.info(
 6413                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6414                        )
 6415
 6416                    else:
 6417
 6418                        log.info(
 6419                            f"Annotation '{annotation_name}' - No Annotations available"
 6420                        )
 6421
 6422                    log.debug("Final header: " + str(vcf_reader.infos))
 6423
 6424        # Remove added columns
 6425        for added_column in added_columns:
 6426            self.drop_column(column=added_column)
 6427
 6428    def annotation_splice(self, threads: int = None) -> None:
 6429        """
 6430        This function annotate with snpEff
 6431
 6432        :param threads: The number of threads to use
 6433        :return: the value of the variable "return_value".
 6434        """
 6435
 6436        # DEBUG
 6437        log.debug("Start annotation with splice tools")
 6438
 6439        # Threads
 6440        if not threads:
 6441            threads = self.get_threads()
 6442        log.debug("Threads: " + str(threads))
 6443
 6444        # DEBUG
 6445        delete_tmp = True
 6446        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6447            delete_tmp = False
 6448            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6449
 6450        # Config
 6451        config = self.get_config()
 6452        log.debug("Config: " + str(config))
 6453        splice_config = config.get("tools", {}).get("splice", {})
 6454        if not splice_config:
 6455            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6456            msg_err = "No Splice tool config"
 6457            raise ValueError(msg_err)
 6458        log.debug(f"splice_config: {splice_config}")
 6459
 6460        # Config - Folders - Databases
 6461        databases_folders = (
 6462            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6463        )
 6464        log.debug("Databases annotations: " + str(databases_folders))
 6465
 6466        # Splice docker image
 6467        splice_docker_image = splice_config.get("docker").get("image")
 6468
 6469        # Pull splice image if it's not already there
 6470        if not check_docker_image_exists(splice_docker_image):
 6471            log.warning(
 6472                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6473            )
 6474            try:
 6475                command(f"docker pull {splice_config.get('docker').get('image')}")
 6476            except subprocess.CalledProcessError:
 6477                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6478                log.error(msg_err)
 6479                raise ValueError(msg_err)
 6480
 6481        # Config - splice databases
 6482        splice_databases = (
 6483            config.get("folders", {})
 6484            .get("databases", {})
 6485            .get("splice", DEFAULT_SPLICE_FOLDER)
 6486        )
 6487        splice_databases = full_path(splice_databases)
 6488
 6489        # Param
 6490        param = self.get_param()
 6491        log.debug("Param: " + str(param))
 6492
 6493        # Param
 6494        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6495        log.debug("Options: " + str(options))
 6496
 6497        # Data
 6498        table_variants = self.get_table_variants()
 6499
 6500        # Check if not empty
 6501        log.debug("Check if not empty")
 6502        sql_query_chromosomes = (
 6503            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6504        )
 6505        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6506            log.info("VCF empty")
 6507            return None
 6508
 6509        # Export in VCF
 6510        log.debug("Create initial file to annotate")
 6511
 6512        # Create output folder / work folder
 6513        if options.get("output_folder", ""):
 6514            output_folder = options.get("output_folder", "")
 6515            if not os.path.exists(output_folder):
 6516                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6517        else:
 6518            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6519            if not os.path.exists(output_folder):
 6520                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6521
 6522        if options.get("workdir", ""):
 6523            workdir = options.get("workdir", "")
 6524        else:
 6525            workdir = "/work"
 6526
 6527        # Create tmp VCF file
 6528        tmp_vcf = NamedTemporaryFile(
 6529            prefix=self.get_prefix(),
 6530            dir=output_folder,
 6531            suffix=".vcf",
 6532            delete=False,
 6533        )
 6534        tmp_vcf_name = tmp_vcf.name
 6535
 6536        # VCF header
 6537        header = self.get_header()
 6538
 6539        # Existing annotations
 6540        for vcf_annotation in self.get_header().infos:
 6541
 6542            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6543            log.debug(
 6544                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6545            )
 6546
 6547        # Memory limit
 6548        if config.get("memory", None):
 6549            memory_limit = config.get("memory", "8G").upper()
 6550            # upper()
 6551        else:
 6552            memory_limit = "8G"
 6553        log.debug(f"memory_limit: {memory_limit}")
 6554
 6555        # Check number of variants to annotate
 6556        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6557        where_clause_regex_spip = r"SPiP_\w+"
 6558        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6559        df_list_of_variants_to_annotate = self.get_query_to_df(
 6560            query=f""" SELECT * FROM variants {where_clause} """
 6561        )
 6562        if len(df_list_of_variants_to_annotate) == 0:
 6563            log.warning(
 6564                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6565            )
 6566            return None
 6567        else:
 6568            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6569
 6570        # Export VCF file
 6571        self.export_variant_vcf(
 6572            vcf_file=tmp_vcf_name,
 6573            remove_info=True,
 6574            add_samples=True,
 6575            index=False,
 6576            where_clause=where_clause,
 6577        )
 6578        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6579        if any(value for value in splice_config.values() if value is None):
 6580            log.warning("At least one splice config parameter is empty")
 6581            # exit annotation_splice
 6582            return None
 6583
 6584        # Params in splice nf
 6585        def check_values(dico: dict):
 6586            """
 6587            Ensure parameters for NF splice pipeline
 6588            """
 6589            for key, val in dico.items():
 6590                if key == "genome":
 6591                    if any(
 6592                        assemb in options.get("genome", {})
 6593                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6594                    ):
 6595                        yield f"--{key} hg19"
 6596                    elif any(
 6597                        assemb in options.get("genome", {})
 6598                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6599                    ):
 6600                        yield f"--{key} hg38"
 6601                elif (
 6602                    (isinstance(val, str) and val)
 6603                    or isinstance(val, int)
 6604                    or isinstance(val, bool)
 6605                ):
 6606                    yield f"--{key} {val}"
 6607
 6608        # Genome
 6609        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6610        options["genome"] = genome
 6611        # NF params
 6612        nf_params = []
 6613        # Add options
 6614        if options:
 6615            log.debug(options)
 6616            nf_params = list(check_values(options))
 6617            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6618        else:
 6619            log.debug("No NF params provided")
 6620        # Add threads
 6621        if "threads" not in options.keys():
 6622            nf_params.append(f"--threads {threads}")
 6623        # Genome path
 6624        genome_path = find_genome(
 6625            config.get("folders", {})
 6626            .get("databases", {})
 6627            .get("genomes", DEFAULT_GENOME_FOLDER),
 6628            file=f"{genome}.fa",
 6629        )
 6630        # Add genome path
 6631        if not genome_path:
 6632            raise ValueError(
 6633                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6634            )
 6635        else:
 6636            log.debug(f"Genome: {genome_path}")
 6637            nf_params.append(f"--genome_path {genome_path}")
 6638
 6639        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6640            """
 6641            Setting up updated databases for SPiP and SpliceAI
 6642            """
 6643
 6644            try:
 6645
 6646                # SpliceAI assembly transcriptome
 6647                spliceai_assembly = os.path.join(
 6648                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6649                    options.get("genome"),
 6650                    "transcriptome",
 6651                )
 6652                spip_assembly = options.get("genome")
 6653
 6654                spip = find(
 6655                    f"transcriptome_{spip_assembly}.RData",
 6656                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6657                )
 6658                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6659                log.debug(f"SPiP annotations: {spip}")
 6660                log.debug(f"SpliceAI annotations: {spliceai}")
 6661                if spip and spliceai:
 6662                    return [
 6663                        f"--spip_transcriptome {spip}",
 6664                        f"--spliceai_transcriptome {spliceai}",
 6665                    ]
 6666                else:
 6667                    log.warning(
 6668                        "Can't find splice databases in configuration, use annotations file from image"
 6669                    )
 6670            except TypeError:
 6671                log.warning(
 6672                    "Can't find splice databases in configuration, use annotations file from image"
 6673                )
 6674                return []
 6675
        # Add options, checking whether a transcriptome option has already been provided
 6677        if (
 6678            "spip_transcriptome" not in nf_params
 6679            and "spliceai_transcriptome" not in nf_params
 6680        ):
 6681            splice_reference = splice_annotations(options, config)
 6682            if splice_reference:
 6683                nf_params.extend(splice_reference)
 6684        # nf_params.append(f"--output_folder {output_folder}")
 6685        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6686        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6687        log.debug(cmd)
 6688        splice_config["docker"]["command"] = cmd
 6689
 6690        # Ensure proxy is set
 6691        proxy = [
 6692            f"-e {var}={os.getenv(var)}"
 6693            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6694            if os.getenv(var) is not None
 6695        ]
 6696        docker_cmd = get_bin_command(
 6697            tool="splice",
 6698            bin_type="docker",
 6699            config=config,
 6700            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6701            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6702        )
 6703        # print(docker_cmd)
 6704        # exit()
 6705        # Docker debug
 6706        # if splice_config.get("rm_container"):
 6707        #     rm_container = "--rm"
 6708        # else:
 6709        #     rm_container = ""
 6710        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6711        log.debug(docker_cmd)
 6712        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6713        log.debug(res.stdout)
 6714        if res.stderr:
 6715            log.error(res.stderr)
 6716        res.check_returncode()
 6717        # Update variants
 6718        log.info("Annotation - Updating...")
 6719        # Test find output vcf
 6720        log.debug(
 6721            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6722        )
 6723        output_vcf = []
 6724        # Wrong folder to look in
 6725        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6726            if (
 6727                files
 6728                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6729            ):
 6730                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6731        # log.debug(os.listdir(options.get("output_folder")))
 6732        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6733        if not output_vcf:
 6734            log.debug(
 6735                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6736            )
 6737        else:
 6738            # Get new header from annotated vcf
 6739            log.debug(f"Initial header: {len(header.infos)} fields")
 6740            # Create new header with splice infos
 6741            new_vcf = Variants(input=output_vcf[0])
 6742            new_vcf_header = new_vcf.get_header().infos
 6743            for keys, infos in new_vcf_header.items():
 6744                if keys not in header.infos.keys():
 6745                    header.infos[keys] = infos
 6746            log.debug(f"New header: {len(header.infos)} fields")
 6747            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6748            self.update_from_vcf(output_vcf[0])
 6749
 6750        # Remove file
 6751        remove_if_exists(output_vcf)
 6752
 6753    ###
 6754    # Prioritization
 6755    ###
 6756
    def get_config_default(self, name: str) -> dict:
        """
        Return the built-in default configuration for a given section.

        Two sections are defined:

        - "calculations": operations that derive new fields from variants,
          either SQL-based (with an "operation_query" expression) or
          Python-based (with a "function_name"/"function_params" pair),
          e.g. VARTYPE, NOMEN, snpEff explosions, trio/barcode analysis.
        - "prioritizations": default prioritization profiles mapping
          annotation fields to scoring/flagging criteria.

        :param name: Name of the configuration section to retrieve
            (e.g. "calculations" or "prioritizations")
        :type name: str
        :return: The dictionary of default settings for `name`, or None if
            the section name is unknown.
        """

        config_default = {
            # Calculation operations: "type" selects the engine ("sql" uses
            # "operation_query", "python" dispatches to "function_name" with
            # "function_params"); "available" exposes the operation to users
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "table": "variants",
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                            CASE
                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                                ELSE 'UNDEFINED'
                            END
                            """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "RENAME_INFO_FIELDS": {
                    "type": "python",
                    "name": "RENAME_INFO_FIELDS",
                    "description": "Rename or remove INFO/tags",
                    "available": True,
                    "function_name": "calculation_rename_info_fields",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
                "transcripts_export": {
                    "type": "python",
                    "name": "transcripts_export",
                    "description": "Export transcripts table/view as a file (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_export",
                    "function_params": [],
                },
            },
            # Prioritization profiles: each profile maps an annotation field
            # to a list of criteria (match type/value -> score, flag, comment)
            "prioritizations": {
                "default": {
                    "ANN2": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        return config_default.get(name, None)
 7022
 7023    def get_config_json(
 7024        self, name: str, config_dict: dict = {}, config_file: str = None
 7025    ) -> dict:
 7026        """
 7027        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7028        default values, a dictionary, and a file.
 7029
 7030        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7031        the name of the configuration. It is used to identify and retrieve the configuration settings
 7032        for a specific component or module
 7033        :type name: str
 7034        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7035        dictionary that allows you to provide additional configuration settings or overrides. When you
 7036        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7037        the key is the configuration setting you want to override or
 7038        :type config_dict: dict
 7039        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7040        specify the path to a configuration file that contains additional settings. If provided, the
 7041        function will read the contents of this file and update the configuration dictionary with the
 7042        values found in the file, overriding any existing values with the
 7043        :type config_file: str
 7044        :return: The function `get_config_json` returns a dictionary containing the configuration
 7045        settings.
 7046        """
 7047
 7048        # Create with default prioritizations
 7049        config_default = self.get_config_default(name=name)
 7050        configuration = config_default
 7051        # log.debug(f"configuration={configuration}")
 7052
 7053        # Replace prioritizations from dict
 7054        for config in config_dict:
 7055            configuration[config] = config_dict[config]
 7056
 7057        # Replace prioritizations from file
 7058        config_file = full_path(config_file)
 7059        if config_file:
 7060            if os.path.exists(config_file):
 7061                with open(config_file) as config_file_content:
 7062                    config_file_dict = yaml.safe_load(config_file_content)
 7063                for config in config_file_dict:
 7064                    configuration[config] = config_file_dict[config]
 7065            else:
 7066                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7067                log.error(msg_error)
 7068                raise ValueError(msg_error)
 7069
 7070        return configuration
 7071
 7072    def prioritization(
 7073        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7074    ) -> bool:
 7075        """
 7076        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7077        prioritizes variants based on configured profiles and criteria.
 7078
 7079        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7080        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7081        a table name is provided, the method will prioritize the variants in that specific table
 7082        :type table: str
 7083        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7084        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7085        provided, the code will use a default prefix value of "PZ"
 7086        :type pz_prefix: str
 7087        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7088        additional parameters specific to the prioritization process. These parameters can include
 7089        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7090        configurations needed for the prioritization of variants in a V
 7091        :type pz_param: dict
 7092        :return: A boolean value (True) is being returned from the `prioritization` function.
 7093        """
 7094
 7095        # Config
 7096        config = self.get_config()
 7097
 7098        # Param
 7099        param = self.get_param()
 7100
 7101        # Prioritization param
 7102        if pz_param is not None:
 7103            prioritization_param = pz_param
 7104        else:
 7105            prioritization_param = param.get("prioritization", {})
 7106
 7107        # Configuration profiles
 7108        prioritization_config_file = prioritization_param.get(
 7109            "prioritization_config", None
 7110        )
 7111        prioritization_config_file = full_path(prioritization_config_file)
 7112        prioritizations_config = self.get_config_json(
 7113            name="prioritizations", config_file=prioritization_config_file
 7114        )
 7115
 7116        # Prioritization prefix
 7117        pz_prefix_default = "PZ"
 7118        if pz_prefix is None:
 7119            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7120
 7121        # Prioritization options
 7122        profiles = prioritization_param.get("profiles", [])
 7123        if isinstance(profiles, str):
 7124            profiles = profiles.split(",")
 7125        pzfields = prioritization_param.get(
 7126            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7127        )
 7128        if isinstance(pzfields, str):
 7129            pzfields = pzfields.split(",")
 7130        default_profile = prioritization_param.get("default_profile", None)
 7131        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7132        prioritization_score_mode = prioritization_param.get(
 7133            "prioritization_score_mode", "HOWARD"
 7134        )
 7135
 7136        # Quick Prioritizations
 7137        prioritizations = param.get("prioritizations", None)
 7138        if prioritizations:
 7139            log.info("Quick Prioritization:")
 7140            for profile in prioritizations.split(","):
 7141                if profile not in profiles:
 7142                    profiles.append(profile)
 7143                    log.info(f"   {profile}")
 7144
 7145        # If profile "ALL" provided, all profiles in the config profiles
 7146        if "ALL" in profiles:
 7147            profiles = list(prioritizations_config.keys())
 7148
 7149        for profile in profiles:
 7150            if prioritizations_config.get(profile, None):
 7151                log.debug(f"Profile '{profile}' configured")
 7152            else:
 7153                msg_error = f"Profile '{profile}' NOT configured"
 7154                log.error(msg_error)
 7155                raise ValueError(msg_error)
 7156
 7157        if profiles:
 7158            log.info(f"Prioritization... ")
 7159        else:
 7160            log.debug(f"No profile defined")
 7161            return False
 7162
 7163        if not default_profile and len(profiles):
 7164            default_profile = profiles[0]
 7165
 7166        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7167        log.debug("Profiles to check: " + str(list(profiles)))
 7168
 7169        # Variables
 7170        if table is not None:
 7171            table_variants = table
 7172        else:
 7173            table_variants = self.get_table_variants(clause="update")
 7174        log.debug(f"Table to prioritize: {table_variants}")
 7175
 7176        # Added columns
 7177        added_columns = []
 7178
 7179        # Create list of PZfields
 7180        # List of PZFields
 7181        list_of_pzfields_original = pzfields + [
 7182            pzfield + pzfields_sep + profile
 7183            for pzfield in pzfields
 7184            for profile in profiles
 7185        ]
 7186        list_of_pzfields = []
 7187        log.debug(f"{list_of_pzfields_original}")
 7188
 7189        # Remove existing PZfields to use if exists
 7190        for pzfield in list_of_pzfields_original:
 7191            if self.get_header().infos.get(pzfield, None) is None:
 7192                list_of_pzfields.append(pzfield)
 7193                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7194            else:
 7195                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7196
 7197        if list_of_pzfields:
 7198
 7199            # Explode Infos prefix
 7200            explode_infos_prefix = self.get_explode_infos_prefix()
 7201
 7202            # PZfields tags description
 7203            PZfields_INFOS = {
 7204                f"{pz_prefix}Tags": {
 7205                    "ID": f"{pz_prefix}Tags",
 7206                    "Number": ".",
 7207                    "Type": "String",
 7208                    "Description": "Variant tags based on annotation criteria",
 7209                },
 7210                f"{pz_prefix}Score": {
 7211                    "ID": f"{pz_prefix}Score",
 7212                    "Number": 1,
 7213                    "Type": "Integer",
 7214                    "Description": "Variant score based on annotation criteria",
 7215                },
 7216                f"{pz_prefix}Flag": {
 7217                    "ID": f"{pz_prefix}Flag",
 7218                    "Number": 1,
 7219                    "Type": "String",
 7220                    "Description": "Variant flag based on annotation criteria",
 7221                },
 7222                f"{pz_prefix}Comment": {
 7223                    "ID": f"{pz_prefix}Comment",
 7224                    "Number": ".",
 7225                    "Type": "String",
 7226                    "Description": "Variant comment based on annotation criteria",
 7227                },
 7228                f"{pz_prefix}Infos": {
 7229                    "ID": f"{pz_prefix}Infos",
 7230                    "Number": ".",
 7231                    "Type": "String",
 7232                    "Description": "Variant infos based on annotation criteria",
 7233                },
 7234                f"{pz_prefix}Class": {
 7235                    "ID": f"{pz_prefix}Class",
 7236                    "Number": ".",
 7237                    "Type": "String",
 7238                    "Description": "Variant class based on annotation criteria",
 7239                },
 7240            }
 7241
 7242            # Create INFO fields if not exist
 7243            for field in PZfields_INFOS:
 7244                field_ID = PZfields_INFOS[field]["ID"]
 7245                field_description = PZfields_INFOS[field]["Description"]
 7246                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7247                    field_description = (
 7248                        PZfields_INFOS[field]["Description"]
 7249                        + f", profile {default_profile}"
 7250                    )
 7251                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7252                        field_ID,
 7253                        PZfields_INFOS[field]["Number"],
 7254                        PZfields_INFOS[field]["Type"],
 7255                        field_description,
 7256                        "unknown",
 7257                        "unknown",
 7258                        code_type_map[PZfields_INFOS[field]["Type"]],
 7259                    )
 7260
 7261            # Create INFO fields if not exist for each profile
 7262            for profile in prioritizations_config:
 7263                if profile in profiles or profiles == []:
 7264                    for field in PZfields_INFOS:
 7265                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7266                        field_description = (
 7267                            PZfields_INFOS[field]["Description"]
 7268                            + f", profile {profile}"
 7269                        )
 7270                        if (
 7271                            field_ID not in self.get_header().infos
 7272                            and field in pzfields
 7273                        ):
 7274                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7275                                field_ID,
 7276                                PZfields_INFOS[field]["Number"],
 7277                                PZfields_INFOS[field]["Type"],
 7278                                field_description,
 7279                                "unknown",
 7280                                "unknown",
 7281                                code_type_map[PZfields_INFOS[field]["Type"]],
 7282                            )
 7283
 7284            # Header
 7285            for pzfield in list_of_pzfields:
 7286                if re.match(f"{pz_prefix}Score.*", pzfield):
 7287                    added_column = self.add_column(
 7288                        table_name=table_variants,
 7289                        column_name=pzfield,
 7290                        column_type="INTEGER",
 7291                        default_value="0",
 7292                    )
 7293                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7294                    added_column = self.add_column(
 7295                        table_name=table_variants,
 7296                        column_name=pzfield,
 7297                        column_type="BOOLEAN",
 7298                        default_value="1",
 7299                    )
 7300                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7301                    added_column = self.add_column(
 7302                        table_name=table_variants,
 7303                        column_name=pzfield,
 7304                        column_type="VARCHAR[]",
 7305                        default_value="null",
 7306                    )
 7307                else:
 7308                    added_column = self.add_column(
 7309                        table_name=table_variants,
 7310                        column_name=pzfield,
 7311                        column_type="STRING",
 7312                        default_value="''",
 7313                    )
 7314                added_columns.append(added_column)
 7315
 7316            # Profiles
 7317            if profiles:
 7318
 7319                # foreach profile in configuration file
 7320                for profile in prioritizations_config:
 7321
 7322                    # If profile is asked in param, or ALL are asked (empty profile [])
 7323                    if profile in profiles or profiles == []:
 7324                        log.info(f"Profile '{profile}'")
 7325
 7326                        sql_set_info_option = ""
 7327
 7328                        sql_set_info = []
 7329
 7330                        # PZ fields set
 7331
 7332                        # PZScore
 7333                        if (
 7334                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7335                            in list_of_pzfields
 7336                        ):
 7337                            sql_set_info.append(
 7338                                f"""
 7339                                    concat(
 7340                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7341                                        {pz_prefix}Score{pzfields_sep}{profile}
 7342                                    ) 
 7343                                """
 7344                            )
 7345                            if (
 7346                                profile == default_profile
 7347                                and f"{pz_prefix}Score" in list_of_pzfields
 7348                            ):
 7349                                sql_set_info.append(
 7350                                    f"""
 7351                                        concat(
 7352                                            '{pz_prefix}Score=',
 7353                                            {pz_prefix}Score{pzfields_sep}{profile}
 7354                                        )
 7355                                    """
 7356                                )
 7357
 7358                        # PZFlag
 7359                        if (
 7360                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7361                            in list_of_pzfields
 7362                        ):
 7363                            sql_set_info.append(
 7364                                f"""
 7365                                    concat(
 7366                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7367                                        CASE 
 7368                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7369                                            THEN 'PASS'
 7370                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7371                                            THEN 'FILTERED'
 7372                                        END
 7373                                    ) 
 7374                                """
 7375                            )
 7376                            if (
 7377                                profile == default_profile
 7378                                and f"{pz_prefix}Flag" in list_of_pzfields
 7379                            ):
 7380                                sql_set_info.append(
 7381                                    f"""
 7382                                        concat(
 7383                                            '{pz_prefix}Flag=',
 7384                                            CASE 
 7385                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7386                                                THEN 'PASS'
 7387                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7388                                                THEN 'FILTERED'
 7389                                            END
 7390                                        )
 7391                                    """
 7392                                )
 7393
 7394                        # PZClass
 7395                        if (
 7396                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7397                            in list_of_pzfields
 7398                        ):
 7399                            sql_set_info.append(
 7400                                f"""
 7401                                    concat(
 7402                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7403                                        CASE
 7404                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7405                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7406                                            ELSE '.'
 7407                                        END 
 7408                                    )
 7409                                    
 7410                                """
 7411                            )
 7412                            if (
 7413                                profile == default_profile
 7414                                and f"{pz_prefix}Class" in list_of_pzfields
 7415                            ):
 7416                                sql_set_info.append(
 7417                                    f"""
 7418                                        concat(
 7419                                            '{pz_prefix}Class=',
 7420                                            CASE
 7421                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7422                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7423                                                ELSE '.'
 7424                                            END 
 7425                                        )
 7426                                    """
 7427                                )
 7428
 7429                        # PZComment
 7430                        if (
 7431                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7432                            in list_of_pzfields
 7433                        ):
 7434                            sql_set_info.append(
 7435                                f"""
 7436                                    CASE
 7437                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7438                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7439                                        ELSE ''
 7440                                    END
 7441                                """
 7442                            )
 7443                            if (
 7444                                profile == default_profile
 7445                                and f"{pz_prefix}Comment" in list_of_pzfields
 7446                            ):
 7447                                sql_set_info.append(
 7448                                    f"""
 7449                                        CASE
 7450                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7451                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7452                                            ELSE ''
 7453                                        END
 7454                                    """
 7455                                )
 7456
 7457                        # PZInfos
 7458                        if (
 7459                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7460                            in list_of_pzfields
 7461                        ):
 7462                            sql_set_info.append(
 7463                                f"""
 7464                                    CASE
 7465                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7466                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7467                                        ELSE ''
 7468                                    END
 7469                                """
 7470                            )
 7471                            if (
 7472                                profile == default_profile
 7473                                and f"{pz_prefix}Infos" in list_of_pzfields
 7474                            ):
 7475                                sql_set_info.append(
 7476                                    f"""
 7477                                        CASE
 7478                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7479                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7480                                            ELSE ''
 7481                                        END
 7482                                    """
 7483                                )
 7484
 7485                        # Merge PZfields
 7486                        sql_set_info_option = ""
 7487                        sql_set_sep = ""
 7488                        for sql_set in sql_set_info:
 7489                            if sql_set_sep:
 7490                                sql_set_info_option += f"""
 7491                                    , concat('{sql_set_sep}', {sql_set})
 7492                                """
 7493                            else:
 7494                                sql_set_info_option += f"""
 7495                                    , {sql_set}
 7496                                """
 7497                            sql_set_sep = ";"
 7498
 7499                        sql_queries = []
 7500                        for annotation in prioritizations_config[profile]:
 7501
 7502                            # skip special sections
 7503                            if annotation.startswith("_"):
 7504                                continue
 7505
 7506                            # For each criterions
 7507                            for criterion in prioritizations_config[profile][
 7508                                annotation
 7509                            ]:
 7510
 7511                                # Criterion mode
 7512                                criterion_mode = None
 7513                                if np.any(
 7514                                    np.isin(list(criterion.keys()), ["type", "value"])
 7515                                ):
 7516                                    criterion_mode = "operation"
 7517                                elif np.any(
 7518                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7519                                ):
 7520                                    criterion_mode = "sql"
 7521                                log.debug(f"Criterion Mode: {criterion_mode}")
 7522
 7523                                # Criterion parameters
 7524                                criterion_type = criterion.get("type", None)
 7525                                criterion_value = criterion.get("value", None)
 7526                                criterion_sql = criterion.get("sql", None)
 7527                                criterion_fields = criterion.get("fields", None)
 7528                                criterion_score = criterion.get("score", 0)
 7529                                criterion_flag = criterion.get("flag", "PASS")
 7530                                criterion_class = criterion.get("class", None)
 7531                                criterion_flag_bool = criterion_flag == "PASS"
 7532                                criterion_comment = (
 7533                                    ", ".join(criterion.get("comment", []))
 7534                                    .replace("'", "''")
 7535                                    .replace(";", ",")
 7536                                    .replace("\t", " ")
 7537                                )
 7538                                criterion_infos = (
 7539                                    str(criterion)
 7540                                    .replace("'", "''")
 7541                                    .replace(";", ",")
 7542                                    .replace("\t", " ")
 7543                                )
 7544
 7545                                # SQL
 7546                                if criterion_sql is not None and isinstance(
 7547                                    criterion_sql, list
 7548                                ):
 7549                                    criterion_sql = " ".join(criterion_sql)
 7550
 7551                                # Fields and explode
 7552                                if criterion_fields is None:
 7553                                    criterion_fields = [annotation]
 7554                                if not isinstance(criterion_fields, list):
 7555                                    criterion_fields = str(criterion_fields).split(",")
 7556
 7557                                # Class
 7558                                if criterion_class is not None and not isinstance(
 7559                                    criterion_class, list
 7560                                ):
 7561                                    criterion_class = str(criterion_class).split(",")
 7562
 7563                                for annotation_field in criterion_fields:
 7564
 7565                                    # Explode specific annotation
 7566                                    log.debug(
 7567                                        f"Explode annotation '{annotation_field}'"
 7568                                    )
 7569                                    added_columns += self.explode_infos(
 7570                                        prefix=explode_infos_prefix,
 7571                                        fields=[annotation_field],
 7572                                        table=table_variants,
 7573                                    )
 7574                                    extra_infos = self.get_extra_infos(
 7575                                        table=table_variants
 7576                                    )
 7577
 7578                                    # Check if annotation field is present
 7579                                    if (
 7580                                        f"{explode_infos_prefix}{annotation_field}"
 7581                                        not in extra_infos
 7582                                    ):
 7583                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7584                                        log.error(msq_err)
 7585                                        raise ValueError(msq_err)
 7586                                    else:
 7587                                        log.debug(
 7588                                            f"Annotation '{annotation_field}' in data"
 7589                                        )
 7590
 7591                                sql_set = []
 7592                                sql_set_info = []
 7593
 7594                                # PZ fields set
 7595
 7596                                # PZScore
 7597                                if (
 7598                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7599                                    in list_of_pzfields
 7600                                ):
 7601                                    # VaRank prioritization score mode
 7602                                    if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
 7603                                        sql_set.append(
 7604                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7605                                        )
 7606                                    # default HOWARD prioritization score mode
 7607                                    else:
 7608                                        sql_set.append(
 7609                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7610                                        )
 7611
 7612                                # PZFlag
 7613                                if (
 7614                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7615                                    in list_of_pzfields
 7616                                ):
 7617                                    sql_set.append(
 7618                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7619                                    )
 7620
 7621                                # PZClass
 7622                                if (
 7623                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7624                                    in list_of_pzfields
 7625                                    and criterion_class is not None
 7626                                ):
 7627                                    sql_set.append(
 7628                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7629                                    )
 7630
 7631                                # PZComment
 7632                                if (
 7633                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7634                                    in list_of_pzfields
 7635                                ):
 7636                                    sql_set.append(
 7637                                        f"""
 7638                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7639                                                concat(
 7640                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7641                                                    CASE 
 7642                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7643                                                        THEN ', '
 7644                                                        ELSE ''
 7645                                                    END,
 7646                                                    '{criterion_comment}'
 7647                                                )
 7648                                        """
 7649                                    )
 7650
 7651                                # PZInfos
 7652                                if (
 7653                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7654                                    in list_of_pzfields
 7655                                ):
 7656                                    sql_set.append(
 7657                                        f"""
 7658                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7659                                                concat(
 7660                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7661                                                    '{criterion_infos}'
 7662                                                )
 7663                                        """
 7664                                    )
 7665                                sql_set_option = ",".join(sql_set)
 7666
 7667                                # Criterion and comparison
 7668                                if sql_set_option:
 7669
 7670                                    if criterion_mode in ["operation"]:
 7671
 7672                                        try:
 7673                                            float(criterion_value)
 7674                                            sql_update = f"""
 7675                                                UPDATE {table_variants}
 7676                                                SET {sql_set_option}
 7677                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7678                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7679                                            """
 7680                                        except:
 7681                                            contains_option = ""
 7682                                            if criterion_type == "contains":
 7683                                                contains_option = ".*"
 7684                                            sql_update = f"""
 7685                                                UPDATE {table_variants}
 7686                                                SET {sql_set_option}
 7687                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7688                                            """
 7689                                        sql_queries.append(sql_update)
 7690
 7691                                    elif criterion_mode in ["sql"]:
 7692
 7693                                        sql_update = f"""
 7694                                            UPDATE {table_variants}
 7695                                            SET {sql_set_option}
 7696                                            WHERE {criterion_sql}
 7697                                        """
 7698                                        sql_queries.append(sql_update)
 7699
 7700                                    else:
 7701                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7702                                        log.error(msg_err)
 7703                                        raise ValueError(msg_err)
 7704
 7705                                else:
 7706                                    log.warning(
 7707                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7708                                    )
 7709
 7710                        # PZTags
 7711                        if (
 7712                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7713                            in list_of_pzfields
 7714                        ):
 7715
                            # Create PZTags value
 7717                            pztags_value = ""
 7718                            pztags_sep_default = ","
 7719                            pztags_sep = ""
 7720                            for pzfield in pzfields:
 7721                                if pzfield not in [f"{pz_prefix}Tags"]:
 7722                                    if (
 7723                                        f"{pzfield}{pzfields_sep}{profile}"
 7724                                        in list_of_pzfields
 7725                                    ):
 7726                                        if pzfield in [f"{pz_prefix}Flag"]:
 7727                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7728                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7729                                                    THEN 'PASS'
 7730                                                    ELSE 'FILTERED'
 7731                                                END, '"""
 7732                                        elif pzfield in [f"{pz_prefix}Class"]:
 7733                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7734                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7735                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7736                                                    ELSE '.'
 7737                                                END, '"""
 7738                                        else:
 7739                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7740                                        pztags_sep = pztags_sep_default
 7741
                            # Add Query update for PZTags
 7743                            sql_update_pztags = f"""
 7744                                UPDATE {table_variants}
 7745                                SET INFO = concat(
 7746                                        INFO,
 7747                                        CASE WHEN INFO NOT in ('','.')
 7748                                                THEN ';'
 7749                                                ELSE ''
 7750                                        END,
 7751                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7752                                    )
 7753                                """
 7754                            sql_queries.append(sql_update_pztags)
 7755
                            # Add Query update for PZTags for the default profile
 7757                            if profile == default_profile:
 7758                                sql_update_pztags_default = f"""
 7759                                UPDATE {table_variants}
 7760                                SET INFO = concat(
 7761                                        INFO,
 7762                                        ';',
 7763                                        '{pz_prefix}Tags={pztags_value}'
 7764                                    )
 7765                                """
 7766                                sql_queries.append(sql_update_pztags_default)
 7767
 7768                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7769
 7770                        if sql_queries:
 7771
 7772                            for sql_query in sql_queries:
 7773                                log.debug(
 7774                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7775                                )
 7776                                self.conn.execute(sql_query)
 7777
 7778                        log.info(f"""Profile '{profile}' - Update... """)
 7779                        sql_query_update = f"""
 7780                            UPDATE {table_variants}
 7781                            SET INFO =  
 7782                                concat(
 7783                                    CASE
 7784                                        WHEN INFO NOT IN ('','.')
 7785                                        THEN concat(INFO, ';')
 7786                                        ELSE ''
 7787                                    END
 7788                                    {sql_set_info_option}
 7789                                )
 7790                        """
 7791                        self.conn.execute(sql_query_update)
 7792
 7793        else:
 7794
 7795            log.warning(f"No profiles in parameters")
 7796
 7797        # Remove added columns
 7798        for added_column in added_columns:
 7799            self.drop_column(column=added_column)
 7800
 7801        # Explode INFOS fields into table fields
 7802        if self.get_explode_infos():
 7803            self.explode_infos(
 7804                prefix=self.get_explode_infos_prefix(),
 7805                fields=self.get_explode_infos_fields(),
 7806                force=True,
 7807            )
 7808
 7809        return True
 7810
 7811    ###
 7812    # HGVS
 7813    ###
 7814
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Processing steps:
        1. Read HGVS options from param (including quick comma-separated "hgvs_options").
        2. Locate the genome FASTA, refSeq and refSeqLink database files.
        3. Select SNV/InDel variants only and add a temporary HGVS column.
        4. Load refSeq/refSeqLink into Polars dataframes and transcripts into memory.
        5. Compute HGVS names per variant in parallel (Dask partitions), write them back
           into the temporary column, and append them to INFO as 'hgvs=...'.
        6. Declare the 'hgvs' INFO tag in the header and drop temporary columns.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function applied to each partition of the Dask Dataframe.
        # NOTE(review): both closures below capture outer names (polars_conn,
        # transcripts, genome, use_exon, ...) that are assigned LATER in this
        # method; Python late binding makes this valid because the closures only
        # run inside ddf.compute(), after everything is set up.
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object that contains the values for the keys
            "CHROM", "POS", "REF" and "ALT"
            :return: a comma-separated string of the HGVS names associated with the row.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # NOTE(review): 'refseq_df' is resolved by name via the Polars
            # SQLContext(register_globals=True) — it refers to the dataframe
            # loaded below in this method.
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): 'refseqlink_df' is only bound when a refSeqLink
                # file was found — if protein output is requested without it,
                # this query presumably fails; confirm upstream configuration
                # guarantees the file in that case.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name with the configured rendering options
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name for the same
                # transcript (only when the first one was not already protein/full)
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Join all HGVS annotations into a single comma-separated value
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): 'pl' (Polars) and 'dd' (Dask) are expected to come from
        # the star imports at the top of the file — confirm.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse comma-separated "key=value" options into param["hgvs"],
        # coercing "true"/"false" (any case) to booleans; bare keys default to True
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS rendering options
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink files: param overrides config
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, otherwise search the genomes folder by assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (alphabetic REF/ALT excludes symbolic alleles)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Columns added along the way, dropped at the end
        added_columns = []

        # Add a temporary, randomly-suffixed hgvs column in the variants table
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading refSeq rows overlapping the selected variants into a Polars Dataframe
        # (queried by name as 'refseq_df' inside the partition function)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading transcript->protein accession mapping (versioned accessions)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe (queried by name as 'refseqlink_df' in the closure)
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model, via a temporary TSV export
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # NOTE(review): rebinds 'polars_conn' so the closures see a context created
        # AFTER refseq_df/refseqlink_df exist as globals — confirm the first
        # SQLContext above is actually needed.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from the Pandas dataframe,
        # with one partition per thread
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (triggers the computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column, matching rows on CHROM/POS/REF/ALT and
            # skipping empty/NULL HGVS values
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Append 'hgvs=<value>' to the INFO column (with ';' separator when INFO
        # already has content)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Declare the 'hgvs' INFO tag in the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8197
 8198    ###
 8199    # Calculation
 8200    ###
 8201
 8202    def get_operations_help(
 8203        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8204    ) -> list:
 8205
 8206        # Init
 8207        operations_help = []
 8208
 8209        # operations
 8210        operations = self.get_config_json(
 8211            name="calculations",
 8212            config_dict=operations_config_dict,
 8213            config_file=operations_config_file,
 8214        )
 8215        for op in operations:
 8216            op_name = operations[op].get("name", op).upper()
 8217            op_description = operations[op].get("description", op_name)
 8218            op_available = operations[op].get("available", False)
 8219            if op_available:
 8220                operations_help.append(f"   {op_name}: {op_description}")
 8221
 8222        # Sort operations
 8223        operations_help.sort()
 8224
 8225        # insert header
 8226        operations_help.insert(0, "Available calculation operations:")
 8227
 8228        # Return
 8229        return operations_help
 8230
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run the configured calculation operations on the variants table.

        For each requested operation, its definition is looked up in the operations
        configuration and dispatched to `calculation_process_function` (type
        "python") or `calculation_process_sql` (type "sql"). Unknown operations or
        unsupported types raise `ValueError`.

        Operation sources, in priority order:
        1. `param["calculation"]["calculations"]` (overrides the `operations` argument)
        2. quick comma-separated list in `param["calculations"]` (merged in first,
           preserving its order, then followed by operations already in param)

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle": null
                }
            }

        :param operations: dictionary of operations to perform (name -> options)
        :type operations: dict (optional)
        :param operations_config_dict: optional operations configuration dictionary
        :type operations_config_dict: dict (optional)
        :param operations_config_file: optional path to an operations configuration file
        :type operations_config_file: str (optional)
        :raises ValueError: when an operation is not available or has an unsupported type
        """
        # NOTE(review): the mutable default arguments ({}) are not mutated here,
        # but consider the None-sentinel idiom.

        # Param
        param = self.get_param()

        # Fall back to the config file declared in param, if none was given
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # Load operations configuration (dict and/or file)
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Normalize config keys to upper case for case-insensitive lookup
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (overrides the `operations` argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation: merge comma-separated list from param["calculations"]
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations dict (to keep the quick-list operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Append operations already in param that were not in the quick list
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Last fallback: operations from param
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # Dispatch each operation by its configured type
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 8358
 8359    def calculation_process_sql(
 8360        self, operation: dict, operation_name: str = "unknown"
 8361    ) -> None:
 8362        """
 8363        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8364        performs the operation, updating the specified table with the result.
 8365
 8366        :param operation: The `operation` parameter is a dictionary that contains information about the
 8367        mathematical operation to be performed. It includes the following keys:
 8368        :type operation: dict
 8369        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8370        the mathematical operation being performed. It is used for logging and error handling purposes,
 8371        defaults to unknown
 8372        :type operation_name: str (optional)
 8373        """
 8374
 8375        # Operation infos
 8376        operation_name = operation.get("name", "unknown")
 8377        log.debug(f"process SQL {operation_name}")
 8378        output_column_name = operation.get("output_column_name", operation_name)
 8379        output_column_type = operation.get("output_column_type", "String")
 8380        prefix = operation.get("explode_infos_prefix", "")
 8381        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8382        output_column_description = operation.get(
 8383            "output_column_description", f"{operation_name} operation"
 8384        )
 8385        operation_query = operation.get("operation_query", None)
 8386        if isinstance(operation_query, list):
 8387            operation_query = " ".join(operation_query)
 8388        operation_info_fields = operation.get("info_fields", [])
 8389        operation_info_fields_check = operation.get("info_fields_check", False)
 8390        operation_info = operation.get("operation_info", True)
 8391        operation_table = operation.get(
 8392            "table", self.get_table_variants(clause="alter")
 8393        )
 8394
 8395        # table variants
 8396        if operation_table:
 8397            table_variants = operation_table
 8398        else:
 8399            table_variants = self.get_table_variants(clause="alter")
 8400
 8401        if operation_query:
 8402
 8403            # Info fields check
 8404            operation_info_fields_check_result = True
 8405            if operation_info_fields_check:
 8406                header_infos = self.get_header().infos
 8407                for info_field in operation_info_fields:
 8408                    operation_info_fields_check_result = (
 8409                        operation_info_fields_check_result
 8410                        and info_field in header_infos
 8411                    )
 8412
 8413            # If info fields available
 8414            if operation_info_fields_check_result:
 8415
 8416                # Added_columns
 8417                added_columns = []
 8418
 8419                # Create VCF header field
 8420                vcf_reader = self.get_header()
 8421                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8422                    output_column_name,
 8423                    ".",
 8424                    output_column_type,
 8425                    output_column_description,
 8426                    "howard calculation",
 8427                    "0",
 8428                    self.code_type_map.get(output_column_type),
 8429                )
 8430
 8431                # Explode infos if needed
 8432                log.debug(f"calculation_process_sql prefix {prefix}")
 8433                added_columns += self.explode_infos(
 8434                    prefix=prefix,
 8435                    fields=[output_column_name] + operation_info_fields,
 8436                    force=False,
 8437                    table=table_variants,
 8438                )
 8439
 8440                # Create column
 8441                added_column = self.add_column(
 8442                    table_name=table_variants,
 8443                    column_name=prefix + output_column_name,
 8444                    column_type=output_column_type_sql,
 8445                    default_value="null",
 8446                )
 8447                added_columns.append(added_column)
 8448
 8449                # Operation calculation
 8450                try:
 8451
 8452                    # Query to update calculation column
 8453                    sql_update = f"""
 8454                        UPDATE {table_variants}
 8455                        SET "{prefix}{output_column_name}" = ({operation_query})
 8456                    """
 8457                    self.conn.execute(sql_update)
 8458
 8459                    # Add to INFO
 8460                    if operation_info:
 8461                        sql_update_info = f"""
 8462                            UPDATE {table_variants}
 8463                            SET "INFO" =
 8464                                concat(
 8465                                    CASE
 8466                                        WHEN "INFO" IS NOT NULL
 8467                                        THEN concat("INFO", ';')
 8468                                        ELSE ''
 8469                                    END,
 8470                                    '{output_column_name}=',
 8471                                    "{prefix}{output_column_name}"
 8472                                )
 8473                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8474                        """
 8475                        self.conn.execute(sql_update_info)
 8476
 8477                except:
 8478                    log.error(
 8479                        f"Operations config: Calculation '{operation_name}' query failed"
 8480                    )
 8481                    raise ValueError(
 8482                        f"Operations config: Calculation '{operation_name}' query failed"
 8483                    )
 8484
 8485                # Remove added columns
 8486                for added_column in added_columns:
 8487                    log.debug(f"added_column: {added_column}")
 8488                    self.drop_column(column=added_column)
 8489
 8490            else:
 8491                log.error(
 8492                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8493                )
 8494                raise ValueError(
 8495                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8496                )
 8497
 8498        else:
 8499            log.error(
 8500                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8501            )
 8502            raise ValueError(
 8503                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8504            )
 8505
 8506    def calculation_process_function(
 8507        self, operation: dict, operation_name: str = "unknown"
 8508    ) -> None:
 8509        """
 8510        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8511        function with the given parameters.
 8512
 8513        :param operation: The `operation` parameter is a dictionary that contains information about the
 8514        operation to be performed. It has the following keys:
 8515        :type operation: dict
 8516        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8517        the operation being performed. It is used for logging purposes, defaults to unknown
 8518        :type operation_name: str (optional)
 8519        """
 8520
 8521        operation_name = operation["name"]
 8522        log.debug(f"process Python {operation_name}")
 8523        function_name = operation["function_name"]
 8524        function_params = operation["function_params"]
 8525        getattr(self, function_name)(*function_params)
 8526
 8527    def calculation_variant_id(self) -> None:
 8528        """
 8529        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8530        updates the INFO field of a variants table with the variant ID.
 8531        """
 8532
 8533        # variant_id annotation field
 8534        variant_id_tag = self.get_variant_id_column()
 8535        added_columns = [variant_id_tag]
 8536
 8537        # variant_id hgvs tags"
 8538        vcf_infos_tags = {
 8539            variant_id_tag: "howard variant ID annotation",
 8540        }
 8541
 8542        # Variants table
 8543        table_variants = self.get_table_variants()
 8544
 8545        # Header
 8546        vcf_reader = self.get_header()
 8547
 8548        # Add variant_id to header
 8549        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8550            variant_id_tag,
 8551            ".",
 8552            "String",
 8553            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8554            "howard calculation",
 8555            "0",
 8556            self.code_type_map.get("String"),
 8557        )
 8558
 8559        # Update
 8560        sql_update = f"""
 8561            UPDATE {table_variants}
 8562            SET "INFO" = 
 8563                concat(
 8564                    CASE
 8565                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8566                        THEN ''
 8567                        ELSE concat("INFO", ';')
 8568                    END,
 8569                    '{variant_id_tag}=',
 8570                    "{variant_id_tag}"
 8571                )
 8572        """
 8573        self.conn.execute(sql_update)
 8574
 8575        # Remove added columns
 8576        for added_column in added_columns:
 8577            self.drop_column(column=added_column)
 8578
 8579    def calculation_extract_snpeff_hgvs(
 8580        self,
 8581        snpeff_hgvs: str = "snpeff_hgvs",
 8582        snpeff_field: str = "ANN",
 8583    ) -> None:
 8584        """
 8585        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8586        annotation field in a VCF file and adds them as a new column in the variants table.
 8587
 8588        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8589        function is used to specify the name of the column that will store the HGVS nomenclatures
 8590        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8591        snpeff_hgvs
 8592        :type snpeff_hgvs: str (optional)
 8593        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8594        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8595        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8596        to ANN
 8597        :type snpeff_field: str (optional)
 8598        """
 8599
 8600        # Snpeff hgvs tags
 8601        vcf_infos_tags = {
 8602            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8603        }
 8604
 8605        # Prefix
 8606        prefix = self.get_explode_infos_prefix()
 8607        if prefix:
 8608            prefix = "INFO/"
 8609
 8610        # snpEff fields
 8611        speff_ann_infos = prefix + snpeff_field
 8612        speff_hgvs_infos = prefix + snpeff_hgvs
 8613
 8614        # Variants table
 8615        table_variants = self.get_table_variants()
 8616
 8617        # Header
 8618        vcf_reader = self.get_header()
 8619
 8620        # Add columns
 8621        added_columns = []
 8622
 8623        # Explode HGVS field in column
 8624        added_columns += self.explode_infos(fields=[snpeff_field])
 8625
 8626        if snpeff_field in vcf_reader.infos:
 8627
 8628            log.debug(vcf_reader.infos[snpeff_field])
 8629
 8630            # Extract ANN header
 8631            ann_description = vcf_reader.infos[snpeff_field].desc
 8632            pattern = r"'(.+?)'"
 8633            match = re.search(pattern, ann_description)
 8634            if match:
 8635                ann_header_match = match.group(1).split(" | ")
 8636                ann_header_desc = {}
 8637                for i in range(len(ann_header_match)):
 8638                    ann_header_info = "".join(
 8639                        char for char in ann_header_match[i] if char.isalnum()
 8640                    )
 8641                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8642                if not ann_header_desc:
 8643                    raise ValueError("Invalid header description format")
 8644            else:
 8645                raise ValueError("Invalid header description format")
 8646
 8647            # Create variant id
 8648            variant_id_column = self.get_variant_id_column()
 8649            added_columns += [variant_id_column]
 8650
 8651            # Create dataframe
 8652            dataframe_snpeff_hgvs = self.get_query_to_df(
 8653                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8654            )
 8655
 8656            # Create main NOMEN column
 8657            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8658                speff_ann_infos
 8659            ].apply(
 8660                lambda x: extract_snpeff_hgvs(
 8661                    str(x), header=list(ann_header_desc.values())
 8662                )
 8663            )
 8664
 8665            # Add snpeff_hgvs to header
 8666            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8667                snpeff_hgvs,
 8668                ".",
 8669                "String",
 8670                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8671                "howard calculation",
 8672                "0",
 8673                self.code_type_map.get("String"),
 8674            )
 8675
 8676            # Update
 8677            sql_update = f"""
 8678                UPDATE variants
 8679                SET "INFO" = 
 8680                    concat(
 8681                        CASE
 8682                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8683                            THEN ''
 8684                            ELSE concat("INFO", ';')
 8685                        END,
 8686                        CASE 
 8687                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8688                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8689                            THEN concat(
 8690                                    '{snpeff_hgvs}=',
 8691                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8692                                )
 8693                            ELSE ''
 8694                        END
 8695                    )
 8696                FROM dataframe_snpeff_hgvs
 8697                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8698
 8699            """
 8700            self.conn.execute(sql_update)
 8701
 8702            # Delete dataframe
 8703            del dataframe_snpeff_hgvs
 8704            gc.collect()
 8705
 8706        else:
 8707
 8708            log.warning(
 8709                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8710            )
 8711
 8712        # Remove added columns
 8713        for added_column in added_columns:
 8714            self.drop_column(column=added_column)
 8715
 8716    def calculation_snpeff_ann_explode(
 8717        self,
 8718        uniquify: bool = True,
 8719        output_format: str = "fields",
 8720        output_prefix: str = "snpeff_",
 8721        snpeff_field: str = "ANN",
 8722    ) -> None:
 8723        """
 8724        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8725        exploding the HGVS field and updating variant information accordingly.
 8726
 8727        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8728        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8729        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8730        defaults to True
 8731        :type uniquify: bool (optional)
 8732        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8733        function specifies the format in which the output annotations will be generated. It has a
 8734        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8735        format, defaults to fields
 8736        :type output_format: str (optional)
 8737        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8738        method is used to specify the prefix that will be added to the output annotations generated
 8739        during the calculation process. This prefix helps to differentiate the newly added annotations
 8740        from existing ones in the output data. By default, the, defaults to ANN_
 8741        :type output_prefix: str (optional)
 8742        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8743        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8744        field will be processed to explode the HGVS annotations and update the variant information
 8745        accordingly, defaults to ANN
 8746        :type snpeff_field: str (optional)
 8747        """
 8748
 8749        # SnpEff annotation field
 8750        snpeff_hgvs = "snpeff_ann_explode"
 8751
 8752        # Snpeff hgvs tags
 8753        vcf_infos_tags = {
 8754            snpeff_hgvs: "Explode snpEff annotations",
 8755        }
 8756
 8757        # Prefix
 8758        prefix = self.get_explode_infos_prefix()
 8759        if prefix:
 8760            prefix = "INFO/"
 8761
 8762        # snpEff fields
 8763        speff_ann_infos = prefix + snpeff_field
 8764        speff_hgvs_infos = prefix + snpeff_hgvs
 8765
 8766        # Variants table
 8767        table_variants = self.get_table_variants()
 8768
 8769        # Header
 8770        vcf_reader = self.get_header()
 8771
 8772        # Add columns
 8773        added_columns = []
 8774
 8775        # Explode HGVS field in column
 8776        added_columns += self.explode_infos(fields=[snpeff_field])
 8777        log.debug(f"snpeff_field={snpeff_field}")
 8778        log.debug(f"added_columns={added_columns}")
 8779
 8780        if snpeff_field in vcf_reader.infos:
 8781
 8782            # Extract ANN header
 8783            ann_description = vcf_reader.infos[snpeff_field].desc
 8784            pattern = r"'(.+?)'"
 8785            match = re.search(pattern, ann_description)
 8786            if match:
 8787                ann_header_match = match.group(1).split(" | ")
 8788                ann_header = []
 8789                ann_header_desc = {}
 8790                for i in range(len(ann_header_match)):
 8791                    ann_header_info = "".join(
 8792                        char for char in ann_header_match[i] if char.isalnum()
 8793                    )
 8794                    ann_header.append(ann_header_info)
 8795                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8796                if not ann_header_desc:
 8797                    raise ValueError("Invalid header description format")
 8798            else:
 8799                raise ValueError("Invalid header description format")
 8800
 8801            # Create variant id
 8802            variant_id_column = self.get_variant_id_column()
 8803            added_columns += [variant_id_column]
 8804
 8805            # Create dataframe
 8806            dataframe_snpeff_hgvs = self.get_query_to_df(
 8807                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8808            )
 8809
 8810            # Create snpEff columns
 8811            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8812                speff_ann_infos
 8813            ].apply(
 8814                lambda x: explode_snpeff_ann(
 8815                    str(x),
 8816                    uniquify=uniquify,
 8817                    output_format=output_format,
 8818                    prefix=output_prefix,
 8819                    header=list(ann_header_desc.values()),
 8820                )
 8821            )
 8822
 8823            # Header
 8824            ann_annotations_prefix = ""
 8825            if output_format.upper() in ["JSON"]:
 8826                ann_annotations_prefix = f"{output_prefix}="
 8827                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8828                    output_prefix,
 8829                    ".",
 8830                    "String",
 8831                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8832                    + " - JSON format",
 8833                    "howard calculation",
 8834                    "0",
 8835                    self.code_type_map.get("String"),
 8836                )
 8837            else:
 8838                for ann_annotation in ann_header:
 8839                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8840                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8841                        ann_annotation_id,
 8842                        ".",
 8843                        "String",
 8844                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8845                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8846                        "howard calculation",
 8847                        "0",
 8848                        self.code_type_map.get("String"),
 8849                    )
 8850
 8851            # Update
 8852            sql_update = f"""
 8853                UPDATE variants
 8854                SET "INFO" = 
 8855                    concat(
 8856                        CASE
 8857                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8858                            THEN ''
 8859                            ELSE concat("INFO", ';')
 8860                        END,
 8861                        CASE 
 8862                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8863                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8864                            THEN concat(
 8865                                '{ann_annotations_prefix}',
 8866                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8867                                )
 8868                            ELSE ''
 8869                        END
 8870                    )
 8871                FROM dataframe_snpeff_hgvs
 8872                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8873
 8874            """
 8875            self.conn.execute(sql_update)
 8876
 8877            # Delete dataframe
 8878            del dataframe_snpeff_hgvs
 8879            gc.collect()
 8880
 8881        else:
 8882
 8883            log.warning(
 8884                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8885            )
 8886
 8887        # Remove added columns
 8888        for added_column in added_columns:
 8889            self.drop_column(column=added_column)
 8890
 8891    def calculation_extract_nomen(self) -> None:
 8892        """
 8893        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8894        """
 8895
 8896        # NOMEN field
 8897        field_nomen_dict = "NOMEN_DICT"
 8898
 8899        # NOMEN structure
 8900        nomen_dict = {
 8901            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8902            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8903            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8904            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8905            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8906            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8907            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8908            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8909            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8910            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8911        }
 8912
 8913        # Param
 8914        param = self.get_param()
 8915
 8916        # Threads
 8917        threads = self.get_threads()
 8918
 8919        # Prefix
 8920        prefix = self.get_explode_infos_prefix()
 8921
 8922        # Header
 8923        vcf_reader = self.get_header()
 8924
 8925        # Added columns
 8926        added_columns = []
 8927
 8928        # Get HGVS field
 8929        hgvs_field = (
 8930            param.get("calculation", {})
 8931            .get("calculations", {})
 8932            .get("NOMEN", {})
 8933            .get("options", {})
 8934            .get("hgvs_field", "hgvs")
 8935        )
 8936
 8937        # Get NOMEN pattern
 8938        nomen_pattern = (
 8939            param.get("calculation", {})
 8940            .get("calculations", {})
 8941            .get("NOMEN", {})
 8942            .get("options", {})
 8943            .get("pattern", None)
 8944        )
 8945
 8946        # transcripts list of preference sources
 8947        transcripts_sources = {}
 8948
 8949        # Get transcripts
 8950        transcripts_file = (
 8951            param.get("calculation", {})
 8952            .get("calculations", {})
 8953            .get("NOMEN", {})
 8954            .get("options", {})
 8955            .get("transcripts", None)
 8956        )
 8957        transcripts_file = full_path(transcripts_file)
 8958        if transcripts_file:
 8959            if os.path.exists(transcripts_file):
 8960                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 8961                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
 8962                transcripts_sources["file"] = transcripts_from_file
 8963            else:
 8964                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
 8965                log.error(msg_err)
 8966                raise ValueError(msg_err)
 8967
 8968        # Get transcripts table
 8969        transcripts_table = (
 8970            param.get("calculation", {})
 8971            .get("calculations", {})
 8972            .get("NOMEN", {})
 8973            .get("options", {})
 8974            .get("transcripts_table", self.get_table_variants())
 8975        )
 8976        # Get transcripts column
 8977        transcripts_column = (
 8978            param.get("calculation", {})
 8979            .get("calculations", {})
 8980            .get("NOMEN", {})
 8981            .get("options", {})
 8982            .get("transcripts_column", None)
 8983        )
 8984
 8985        if transcripts_table and transcripts_column:
 8986            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
 8987            # Explode if not exists
 8988            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
 8989        else:
 8990            extra_field_transcript = f"NULL"
 8991
 8992        # Transcripts of preference source order
 8993        transcripts_order = (
 8994            param.get("calculation", {})
 8995            .get("calculations", {})
 8996            .get("NOMEN", {})
 8997            .get("options", {})
 8998            .get("transcripts_order", ["column", "file"])
 8999        )
 9000
 9001        # Transcripts from file
 9002        transcripts = transcripts_sources.get("file", [])
 9003
 9004        # Explode HGVS field in column
 9005        added_columns += self.explode_infos(fields=[hgvs_field])
 9006
 9007        # extra infos
 9008        extra_infos = self.get_extra_infos()
 9009        extra_field = prefix + hgvs_field
 9010
 9011        if extra_field in extra_infos:
 9012
 9013            # Create dataframe
 9014            dataframe_hgvs = self.get_query_to_df(
 9015                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
 9016            )
 9017
 9018            # Transcripts rank
 9019            transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)}
 9020            transcripts_len = len(transcripts_rank)
 9021
 9022            # Create main NOMEN column
 9023            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
 9024                lambda x: find_nomen(
 9025                    hgvs=x.hgvs,
 9026                    transcript=x.transcript,
 9027                    transcripts=transcripts_rank,
 9028                    pattern=nomen_pattern,
 9029                    transcripts_source_order=transcripts_order,
 9030                    transcripts_len=transcripts_len
 9031                ),
 9032                axis=1,
 9033            )
 9034
 9035            # Explode NOMEN Structure and create SQL set for update
 9036            sql_nomen_fields = []
 9037            for nomen_field in nomen_dict:
 9038
 9039                # Create VCF header field
 9040                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 9041                    nomen_field,
 9042                    ".",
 9043                    "String",
 9044                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 9045                    "howard calculation",
 9046                    "0",
 9047                    self.code_type_map.get("String"),
 9048                )
 9049
 9050                # Add field to SQL query update
 9051                sql_nomen_fields.append(
 9052                    f"""
 9053                        CASE 
 9054                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
 9055                            THEN concat(
 9056                                    ';{nomen_field}=',
 9057                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
 9058                                )
 9059                            ELSE ''
 9060                        END
 9061                    """
 9062                )
 9063
 9064            # SQL set for update
 9065            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 9066
 9067            # Update
 9068            sql_update = f"""
 9069                UPDATE variants
 9070                SET "INFO" = 
 9071                    concat(
 9072                        CASE
 9073                            WHEN "INFO" IS NULL
 9074                            THEN ''
 9075                            ELSE "INFO"
 9076                        END,
 9077                        {sql_nomen_fields_set}
 9078                    )
 9079                FROM dataframe_hgvs
 9080                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 9081                    AND variants."POS" = dataframe_hgvs."POS" 
 9082                    AND variants."REF" = dataframe_hgvs."REF"
 9083                    AND variants."ALT" = dataframe_hgvs."ALT"
 9084            """
 9085            self.conn.execute(sql_update)
 9086
 9087            # Delete dataframe
 9088            del dataframe_hgvs
 9089            gc.collect()
 9090
 9091        # Remove added columns
 9092        for added_column in added_columns:
 9093            self.drop_column(column=added_column)
 9094
 9095    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9096        """
 9097        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9098        pipeline/sample for a variant and updates the variant information in a VCF file.
 9099
 9100        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9101        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9102        VCF header and to update the corresponding field in the variants table, defaults to
 9103        findbypipeline
 9104        :type tag: str (optional)
 9105        """
 9106
 9107        # if FORMAT and samples
 9108        if (
 9109            "FORMAT" in self.get_header_columns_as_list()
 9110            and self.get_header_sample_list()
 9111        ):
 9112
 9113            # findbypipeline annotation field
 9114            findbypipeline_tag = tag
 9115
 9116            # VCF infos tags
 9117            vcf_infos_tags = {
 9118                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9119            }
 9120
 9121            # Prefix
 9122            prefix = self.get_explode_infos_prefix()
 9123
 9124            # Field
 9125            findbypipeline_infos = prefix + findbypipeline_tag
 9126
 9127            # Variants table
 9128            table_variants = self.get_table_variants()
 9129
 9130            # Header
 9131            vcf_reader = self.get_header()
 9132
 9133            # Create variant id
 9134            variant_id_column = self.get_variant_id_column()
 9135            added_columns = [variant_id_column]
 9136
 9137            # variant_id, FORMAT and samples
 9138            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9139                self.get_header_sample_list()
 9140            )
 9141
 9142            # Create dataframe
 9143            dataframe_findbypipeline = self.get_query_to_df(
 9144                f""" SELECT {samples_fields} FROM {table_variants} """
 9145            )
 9146
 9147            # Create findbypipeline column
 9148            dataframe_findbypipeline[findbypipeline_infos] = (
 9149                dataframe_findbypipeline.apply(
 9150                    lambda row: findbypipeline(
 9151                        row, samples=self.get_header_sample_list()
 9152                    ),
 9153                    axis=1,
 9154                )
 9155            )
 9156
 9157            # Add snpeff_hgvs to header
 9158            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9159                findbypipeline_tag,
 9160                ".",
 9161                "String",
 9162                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9163                "howard calculation",
 9164                "0",
 9165                self.code_type_map.get("String"),
 9166            )
 9167
 9168            # Update
 9169            sql_update = f"""
 9170                UPDATE variants
 9171                SET "INFO" = 
 9172                    concat(
 9173                        CASE
 9174                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9175                            THEN ''
 9176                            ELSE concat("INFO", ';')
 9177                        END,
 9178                        CASE 
 9179                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9180                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9181                            THEN concat(
 9182                                    '{findbypipeline_tag}=',
 9183                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9184                                )
 9185                            ELSE ''
 9186                        END
 9187                    )
 9188                FROM dataframe_findbypipeline
 9189                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9190            """
 9191            self.conn.execute(sql_update)
 9192
 9193            # Remove added columns
 9194            for added_column in added_columns:
 9195                self.drop_column(column=added_column)
 9196
 9197            # Delete dataframe
 9198            del dataframe_findbypipeline
 9199            gc.collect()
 9200
 9201    def calculation_genotype_concordance(self) -> None:
 9202        """
 9203        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9204        multi-caller VCF files and updates the variant information in the database.
 9205        """
 9206
 9207        # if FORMAT and samples
 9208        if (
 9209            "FORMAT" in self.get_header_columns_as_list()
 9210            and self.get_header_sample_list()
 9211        ):
 9212
 9213            # genotypeconcordance annotation field
 9214            genotypeconcordance_tag = "genotypeconcordance"
 9215
 9216            # VCF infos tags
 9217            vcf_infos_tags = {
 9218                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9219            }
 9220
 9221            # Prefix
 9222            prefix = self.get_explode_infos_prefix()
 9223
 9224            # Field
 9225            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9226
 9227            # Variants table
 9228            table_variants = self.get_table_variants()
 9229
 9230            # Header
 9231            vcf_reader = self.get_header()
 9232
 9233            # Create variant id
 9234            variant_id_column = self.get_variant_id_column()
 9235            added_columns = [variant_id_column]
 9236
 9237            # variant_id, FORMAT and samples
 9238            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9239                self.get_header_sample_list()
 9240            )
 9241
 9242            # Create dataframe
 9243            dataframe_genotypeconcordance = self.get_query_to_df(
 9244                f""" SELECT {samples_fields} FROM {table_variants} """
 9245            )
 9246
 9247            # Create genotypeconcordance column
 9248            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9249                dataframe_genotypeconcordance.apply(
 9250                    lambda row: genotypeconcordance(
 9251                        row, samples=self.get_header_sample_list()
 9252                    ),
 9253                    axis=1,
 9254                )
 9255            )
 9256
 9257            # Add genotypeconcordance to header
 9258            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9259                genotypeconcordance_tag,
 9260                ".",
 9261                "String",
 9262                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9263                "howard calculation",
 9264                "0",
 9265                self.code_type_map.get("String"),
 9266            )
 9267
 9268            # Update
 9269            sql_update = f"""
 9270                UPDATE variants
 9271                SET "INFO" = 
 9272                    concat(
 9273                        CASE
 9274                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9275                            THEN ''
 9276                            ELSE concat("INFO", ';')
 9277                        END,
 9278                        CASE
 9279                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9280                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9281                            THEN concat(
 9282                                    '{genotypeconcordance_tag}=',
 9283                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9284                                )
 9285                            ELSE ''
 9286                        END
 9287                    )
 9288                FROM dataframe_genotypeconcordance
 9289                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9290            """
 9291            self.conn.execute(sql_update)
 9292
 9293            # Remove added columns
 9294            for added_column in added_columns:
 9295                self.drop_column(column=added_column)
 9296
 9297            # Delete dataframe
 9298            del dataframe_genotypeconcordance
 9299            gc.collect()
 9300
 9301    def calculation_barcode(self, tag: str = "barcode") -> None:
 9302        """
 9303        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9304        updates the INFO field in the file with the calculated barcode values.
 9305
 9306        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9307        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9308        the default tag name is set to "barcode", defaults to barcode
 9309        :type tag: str (optional)
 9310        """
 9311
 9312        # if FORMAT and samples
 9313        if (
 9314            "FORMAT" in self.get_header_columns_as_list()
 9315            and self.get_header_sample_list()
 9316        ):
 9317
 9318            # barcode annotation field
 9319            if not tag:
 9320                tag = "barcode"
 9321
 9322            # VCF infos tags
 9323            vcf_infos_tags = {
 9324                tag: "barcode calculation (VaRank)",
 9325            }
 9326
 9327            # Prefix
 9328            prefix = self.get_explode_infos_prefix()
 9329
 9330            # Field
 9331            barcode_infos = prefix + tag
 9332
 9333            # Variants table
 9334            table_variants = self.get_table_variants()
 9335
 9336            # Header
 9337            vcf_reader = self.get_header()
 9338
 9339            # Create variant id
 9340            variant_id_column = self.get_variant_id_column()
 9341            added_columns = [variant_id_column]
 9342
 9343            # variant_id, FORMAT and samples
 9344            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9345                self.get_header_sample_list()
 9346            )
 9347
 9348            # Create dataframe
 9349            dataframe_barcode = self.get_query_to_df(
 9350                f""" SELECT {samples_fields} FROM {table_variants} """
 9351            )
 9352
 9353            # Create barcode column
 9354            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9355                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9356            )
 9357
 9358            # Add barcode to header
 9359            vcf_reader.infos[tag] = vcf.parser._Info(
 9360                tag,
 9361                ".",
 9362                "String",
 9363                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9364                "howard calculation",
 9365                "0",
 9366                self.code_type_map.get("String"),
 9367            )
 9368
 9369            # Update
 9370            sql_update = f"""
 9371                UPDATE {table_variants}
 9372                SET "INFO" = 
 9373                    concat(
 9374                        CASE
 9375                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9376                            THEN ''
 9377                            ELSE concat("INFO", ';')
 9378                        END,
 9379                        CASE
 9380                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9381                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9382                            THEN concat(
 9383                                    '{tag}=',
 9384                                    dataframe_barcode."{barcode_infos}"
 9385                                )
 9386                            ELSE ''
 9387                        END
 9388                    )
 9389                FROM dataframe_barcode
 9390                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9391            """
 9392            self.conn.execute(sql_update)
 9393
 9394            # Remove added columns
 9395            for added_column in added_columns:
 9396                self.drop_column(column=added_column)
 9397
 9398            # Delete dataframe
 9399            del dataframe_barcode
 9400            gc.collect()
 9401
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Compute a family barcode from the genotypes of the pedigree samples
        and append it to the FORMAT column and to every sample column.

        Two FORMAT fields are added: the barcode itself (`tag`, default "BCF")
        and the comma-separated list of samples it was computed from
        (`tag` + "S"). The pedigree is read from param
        `calculation.calculations.BARCODEFAMILY.family_pedigree` and may be a
        file path (parsed as YAML, which also covers JSON), a JSON string, a
        comma-separated list of sample names, or a dict; when absent, all
        samples of the VCF are used.

        :param tag: FORMAT tag used for the family barcode; falls back to
        "BCF" when empty, defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or empty
        """

        # Genotype data is required: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag name when an empty value is given
            if not tag:
                tag = "BCF"

            # Descriptions for the two FORMAT tags added below
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree definition from param (file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file path: parse it as YAML (also accepts JSON)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, then fall back to a
                # comma-separated sample list (each sample mapped to itself)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Samples used for the barcode are the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added to the table for the join, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns needed for the barcode: variant id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes as a pandas dataframe; DuckDB reads it below by
            # its Python variable name ('dataframe_barcode')
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute one family barcode per variant from the pedigree samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare both FORMAT tags in the header: the barcode itself and
            # the list of samples it was computed from
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: pedigree samples receive the
            # barcode value and the sample list, FORMAT receives the tag
            # names, and non-pedigree samples receive '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.' genotypes: strip FORMAT keys down to its ':'
                # separators, then turn each ':' into ':.' so the genotype
                # gets one '.' placeholder per FORMAT field before appending
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary join column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_barcode
            gc.collect()
 9591
 9592    def calculation_trio(self) -> None:
 9593        """
 9594        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9595        information to the INFO field of each variant.
 9596        """
 9597
 9598        # if FORMAT and samples
 9599        if (
 9600            "FORMAT" in self.get_header_columns_as_list()
 9601            and self.get_header_sample_list()
 9602        ):
 9603
 9604            # trio annotation field
 9605            trio_tag = "trio"
 9606
 9607            # VCF infos tags
 9608            vcf_infos_tags = {
 9609                "trio": "trio calculation",
 9610            }
 9611
 9612            # Param
 9613            param = self.get_param()
 9614
 9615            # Prefix
 9616            prefix = self.get_explode_infos_prefix()
 9617
 9618            # Trio param
 9619            trio_ped = (
 9620                param.get("calculation", {})
 9621                .get("calculations", {})
 9622                .get("TRIO", {})
 9623                .get("trio_pedigree", None)
 9624            )
 9625
 9626            # Load trio
 9627            if trio_ped:
 9628
 9629                # Trio pedigree is a file
 9630                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9631                    log.debug("TRIO pedigree is file")
 9632                    with open(full_path(trio_ped)) as trio_ped:
 9633                        trio_ped = yaml.safe_load(trio_ped)
 9634
 9635                # Trio pedigree is a string
 9636                elif isinstance(trio_ped, str):
 9637                    log.debug("TRIO pedigree is str")
 9638                    try:
 9639                        trio_ped = json.loads(trio_ped)
 9640                        log.debug("TRIO pedigree is json str")
 9641                    except ValueError as e:
 9642                        trio_samples = trio_ped.split(",")
 9643                        if len(trio_samples) == 3:
 9644                            trio_ped = {
 9645                                "father": trio_samples[0],
 9646                                "mother": trio_samples[1],
 9647                                "child": trio_samples[2],
 9648                            }
 9649                            log.debug("TRIO pedigree is list str")
 9650                        else:
 9651                            msg_error = "TRIO pedigree not well formatted"
 9652                            log.error(msg_error)
 9653                            raise ValueError(msg_error)
 9654
 9655                # Trio pedigree is a dict
 9656                elif isinstance(trio_ped, dict):
 9657                    log.debug("TRIO pedigree is dict")
 9658
 9659                # Trio pedigree is not well formatted
 9660                else:
 9661                    msg_error = "TRIO pedigree not well formatted"
 9662                    log.error(msg_error)
 9663                    raise ValueError(msg_error)
 9664
 9665                # Construct trio list
 9666                trio_samples = [
 9667                    trio_ped.get("father", ""),
 9668                    trio_ped.get("mother", ""),
 9669                    trio_ped.get("child", ""),
 9670                ]
 9671
 9672            else:
 9673                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9674                samples_list = self.get_header_sample_list()
 9675                if len(samples_list) >= 3:
 9676                    trio_samples = self.get_header_sample_list()[0:3]
 9677                    trio_ped = {
 9678                        "father": trio_samples[0],
 9679                        "mother": trio_samples[1],
 9680                        "child": trio_samples[2],
 9681                    }
 9682                else:
 9683                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9684                    log.error(msg_error)
 9685                    raise ValueError(msg_error)
 9686
 9687            # Check trio pedigree
 9688            if not trio_ped or len(trio_ped) != 3:
 9689                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9690                log.error(msg_error)
 9691                raise ValueError(msg_error)
 9692
 9693            # Log
 9694            log.info(
 9695                f"Calculation 'TRIO' - Samples: "
 9696                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9697            )
 9698
 9699            # Field
 9700            trio_infos = prefix + trio_tag
 9701
 9702            # Variants table
 9703            table_variants = self.get_table_variants()
 9704
 9705            # Header
 9706            vcf_reader = self.get_header()
 9707
 9708            # Create variant id
 9709            variant_id_column = self.get_variant_id_column()
 9710            added_columns = [variant_id_column]
 9711
 9712            # variant_id, FORMAT and samples
 9713            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9714                self.get_header_sample_list()
 9715            )
 9716
 9717            # Create dataframe
 9718            dataframe_trio = self.get_query_to_df(
 9719                f""" SELECT {samples_fields} FROM {table_variants} """
 9720            )
 9721
 9722            # Create trio column
 9723            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9724                lambda row: trio(row, samples=trio_samples), axis=1
 9725            )
 9726
 9727            # Add trio to header
 9728            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9729                trio_tag,
 9730                ".",
 9731                "String",
 9732                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9733                "howard calculation",
 9734                "0",
 9735                self.code_type_map.get("String"),
 9736            )
 9737
 9738            # Update
 9739            sql_update = f"""
 9740                UPDATE {table_variants}
 9741                SET "INFO" = 
 9742                    concat(
 9743                        CASE
 9744                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9745                            THEN ''
 9746                            ELSE concat("INFO", ';')
 9747                        END,
 9748                        CASE
 9749                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9750                             AND dataframe_trio."{trio_infos}" NOT NULL
 9751                            THEN concat(
 9752                                    '{trio_tag}=',
 9753                                    dataframe_trio."{trio_infos}"
 9754                                )
 9755                            ELSE ''
 9756                        END
 9757                    )
 9758                FROM dataframe_trio
 9759                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9760            """
 9761            self.conn.execute(sql_update)
 9762
 9763            # Remove added columns
 9764            for added_column in added_columns:
 9765                self.drop_column(column=added_column)
 9766
 9767            # Delete dataframe
 9768            del dataframe_trio
 9769            gc.collect()
 9770
 9771    def calculation_vaf_normalization(self) -> None:
 9772        """
 9773        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9774        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9775        :return: The function does not return anything.
 9776        """
 9777
 9778        # if FORMAT and samples
 9779        if (
 9780            "FORMAT" in self.get_header_columns_as_list()
 9781            and self.get_header_sample_list()
 9782        ):
 9783
 9784            # vaf_normalization annotation field
 9785            vaf_normalization_tag = "VAF"
 9786
 9787            # VCF infos tags
 9788            vcf_infos_tags = {
 9789                "VAF": "VAF Variant Frequency",
 9790            }
 9791
 9792            # Prefix
 9793            prefix = self.get_explode_infos_prefix()
 9794
 9795            # Variants table
 9796            table_variants = self.get_table_variants()
 9797
 9798            # Header
 9799            vcf_reader = self.get_header()
 9800
 9801            # Do not calculate if VAF already exists
 9802            if "VAF" in vcf_reader.formats:
 9803                log.debug("VAF already on genotypes")
 9804                return
 9805
 9806            # Create variant id
 9807            variant_id_column = self.get_variant_id_column()
 9808            added_columns = [variant_id_column]
 9809
 9810            # variant_id, FORMAT and samples
 9811            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9812                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9813            )
 9814
 9815            # Create dataframe
 9816            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9817            log.debug(f"query={query}")
 9818            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9819
 9820            vaf_normalization_set = []
 9821
 9822            # for each sample vaf_normalization
 9823            for sample in self.get_header_sample_list():
 9824                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9825                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9826                )
 9827                vaf_normalization_set.append(
 9828                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9829                )
 9830
 9831            # Add VAF to FORMAT
 9832            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9833                "FORMAT"
 9834            ].apply(lambda x: str(x) + ":VAF")
 9835            vaf_normalization_set.append(
 9836                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9837            )
 9838
 9839            # Add vaf_normalization to header
 9840            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9841                id=vaf_normalization_tag,
 9842                num="1",
 9843                type="Float",
 9844                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9845                type_code=self.code_type_map.get("Float"),
 9846            )
 9847
 9848            # Create fields to add in INFO
 9849            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9850
 9851            # Update
 9852            sql_update = f"""
 9853                UPDATE {table_variants}
 9854                SET {sql_vaf_normalization_set}
 9855                FROM dataframe_vaf_normalization
 9856                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9857
 9858            """
 9859            self.conn.execute(sql_update)
 9860
 9861            # Remove added columns
 9862            for added_column in added_columns:
 9863                self.drop_column(column=added_column)
 9864
 9865            # Delete dataframe
 9866            del dataframe_vaf_normalization
 9867            gc.collect()
 9868
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant statistics of a genotype field across all samples
        and append them to the INFO column.

        For the given genotype field (e.g. "VAF"), seven INFO tags are
        produced: `<info>_stats_nb`, `_stats_list`, `_stats_min`, `_stats_max`,
        `_stats_mean`, `_stats_mediane` and `_stats_stdev`. The statistics are
        computed in pandas via `genotype_stats`, then written back into the
        variants table through a single DuckDB UPDATE joined on the variant id.

        :param info: name of the genotype field to compute statistics for,
        defaults to VAF
        :type info: str (optional)
        """

        # Genotype data is required: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # One INFO tag per statistic (count, list, min, max, mean, median, stdev)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Column holding the dict of computed statistics
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added to the table for the join, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns needed: variant id, FORMAT and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes as a pandas dataframe; DuckDB reads it below by
            # its Python variable name ('dataframe_vaf_stats')
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute, per variant, a dict of statistics over all samples
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments appending each statistic to INFO
            sql_vaf_stats_fields = []

            # One dataframe column, header entry and SQL fragment per statistic
            for stat in vcf_infos_tags:

                # Extract this statistic from the dict column ('' when missing)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic's tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First statistic gets no leading separator, later ones do.
                # NOTE(review): if the first statistic is NULL but a later one
                # is not, the later fragment still emits a leading ';', which
                # can produce a double ';' after the existing INFO — confirm
                # whether genotype_stats can yield partially-NULL stats.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Emit '<sep><stat>=<value>' when the value is not NULL
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # All fragments become arguments of a single concat() below
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the statistics to INFO, joined on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Drop the temporary join column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_vaf_stats
            gc.collect()
10006
10007    def calculation_transcripts_annotation(
10008        self, info_json: str = None, info_format: str = None
10009    ) -> None:
10010        """
10011        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10012        field to it if transcripts are available.
10013
10014        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10015        is a string parameter that represents the information field to be used in the transcripts JSON.
10016        It is used to specify the JSON format for the transcripts information. If no value is provided
10017        when calling the method, it defaults to "
10018        :type info_json: str
10019        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10020        method is a string parameter that specifies the format of the information field to be used in
10021        the transcripts JSON. It is used to define the format of the information field
10022        :type info_format: str
10023        """
10024
10025        # Create transcripts table
10026        transcripts_table = self.create_transcript_view()
10027
10028        # Add info field
10029        if transcripts_table:
10030            self.transcript_view_to_variants(
10031                transcripts_table=transcripts_table,
10032                transcripts_info_field_json=info_json,
10033                transcripts_info_field_format=info_format,
10034            )
10035        else:
10036            log.info("No Transcripts to process. Check param.json file configuration")
10037
10038    def calculation_transcripts_prioritization(self) -> None:
10039        """
10040        The function `calculation_transcripts_prioritization` creates a transcripts table and
10041        prioritizes transcripts based on certain criteria.
10042        """
10043
10044        # Create transcripts table
10045        transcripts_table = self.create_transcript_view()
10046
10047        # Add info field
10048        if transcripts_table:
10049            self.transcripts_prioritization(transcripts_table=transcripts_table)
10050        else:
10051            log.info("No Transcripts to process. Check param.json file configuration")
10052
10053    def calculation_transcripts_export(self) -> None:
10054        """ """
10055
10056        # Create transcripts table
10057        transcripts_table = self.create_transcript_view()
10058
10059        # Add info field
10060        if transcripts_table:
10061            self.transcripts_export(transcripts_table=transcripts_table)
10062        else:
10063            log.info("No Transcripts to process. Check param.json file configuration")
10064
10065    ###############
10066    # Transcripts #
10067    ###############
10068
10069    def transcripts_export(
10070        self, transcripts_table: str = None, param: dict = {}
10071    ) -> bool:
10072        """ """
10073
10074        log.debug("Start transcripts export...")
10075
10076        # Param
10077        if not param:
10078            param = self.get_param()
10079
10080        # Param export
10081        param_transcript_export = param.get("transcripts", {}).get("export", {})
10082
10083        # Output file
10084        transcripts_export_output = param_transcript_export.get("output", None)
10085
10086        if not param_transcript_export or not transcripts_export_output:
10087            log.warning(f"No transcriipts export parameters defined!")
10088            return False
10089
10090        # List of transcripts annotations
10091        query_describe = f"""
10092            SELECT column_name
10093            FROM (
10094                    DESCRIBE SELECT * FROM {transcripts_table}
10095                )
10096            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10097        """
10098        transcripts_annotations_list = list(
10099            self.get_query_to_df(query=query_describe)["column_name"]
10100        )
10101
10102        # Create transcripts table for export
10103        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10104            random.choices(string.ascii_uppercase + string.digits, k=10)
10105        )
10106        query_create_transcripts_table_export = f"""
10107            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10108        """
10109        self.execute_query(query=query_create_transcripts_table_export)
10110
10111        # Output file format
10112        transcripts_export_output_format = get_file_format(
10113            filename=transcripts_export_output
10114        )
10115
10116        # Format VCF - construct INFO
10117        if transcripts_export_output_format in ["vcf"]:
10118
10119            # Construct query update INFO and header
10120            query_update_info = []
10121            for field in transcripts_annotations_list:
10122
10123                # If field not in header
10124                if field not in self.get_header_infos_list():
10125
10126                    # Add PZ Transcript in header
10127                    self.get_header().infos[field] = vcf.parser._Info(
10128                        field,
10129                        ".",
10130                        "String",
10131                        f"Annotation '{field}' from transcript view",
10132                        "unknown",
10133                        "unknown",
10134                        0,
10135                    )
10136
10137                # Add field as INFO/tag
10138                query_update_info.append(
10139                    f"""
10140                        CASE
10141                            WHEN "{field}" IS NOT NULL
10142                            THEN concat('{field}=', "{field}", ';')    
10143                            ELSE ''     
10144                        END
10145                        """
10146                )
10147
10148            # Query param
10149            query_update_info_value = (
10150                f""" concat('',  {", ".join(query_update_info)}) """
10151            )
10152            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10153
10154        else:
10155
10156            # Query param
10157            query_update_info_value = f""" NULL """
10158            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10159
10160        # Update query INFO column
10161        query_update = f"""
10162            UPDATE {transcripts_table_export}
10163            SET INFO = {query_update_info_value}
10164
10165        """
10166        self.execute_query(query=query_update)
10167
10168        # Export
10169        self.export_output(
10170            output_file=transcripts_export_output,
10171            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10172        )
10173
10174        # Drop transcripts export table
10175        query_drop_transcripts_table_export = f"""
10176            DROP TABLE {transcripts_table_export}
10177        """
10178        self.execute_query(query=query_drop_transcripts_table_export)
10179
10180    def transcripts_prioritization(
10181        self, transcripts_table: str = None, param: dict = {}
10182    ) -> bool:
10183        """
10184        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10185        and updates the variants table with the prioritized information.
10186
10187        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10188        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10189        This parameter is used to identify the table where the transcripts data is stored for the
10190        prioritization process
10191        :type transcripts_table: str
10192        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10193        that contains various configuration settings for the prioritization process of transcripts. It
10194        is used to customize the behavior of the prioritization algorithm and includes settings such as
10195        the prefix for prioritization fields, default profiles, and other
10196        :type param: dict
10197        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10198        transcripts prioritization process is successfully completed, and `False` if there are any
10199        issues or if no profile is defined for transcripts prioritization.
10200        """
10201
10202        log.debug("Start transcripts prioritization...")
10203
10204        # Param
10205        if not param:
10206            param = self.get_param()
10207
10208        # Variants table
10209        table_variants = self.get_table_variants()
10210
10211        # Transcripts table
10212        if transcripts_table is None:
10213            transcripts_table = self.create_transcript_view(
10214                transcripts_table="transcripts", param=param
10215            )
10216        if transcripts_table is None:
10217            msg_err = "No Transcripts table availalble"
10218            log.error(msg_err)
10219            raise ValueError(msg_err)
10220        log.debug(f"transcripts_table={transcripts_table}")
10221
10222        # Get transcripts columns
10223        columns_as_list_query = f"""
10224            DESCRIBE {transcripts_table}
10225        """
10226        columns_as_list = list(
10227            self.get_query_to_df(columns_as_list_query)["column_name"]
10228        )
10229
10230        # Create INFO if not exists
10231        if "INFO" not in columns_as_list:
10232            query_add_info = f"""
10233                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10234            """
10235            self.execute_query(query_add_info)
10236
10237        # Prioritization param and Force only PZ Score and Flag
10238        pz_param = param.get("transcripts", {}).get("prioritization", {})
10239
10240        # PZ profile by default
10241        pz_profile_default = (
10242            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10243        )
10244
10245        # Exit if no profile
10246        if pz_profile_default is None:
10247            log.warning("No profile defined for transcripts prioritization")
10248            return False
10249
10250        # PZ fields
10251        pz_param_pzfields = {}
10252
10253        # PZ field transcripts
10254        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10255
10256        # Add PZ Transcript in header
10257        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10258            pz_fields_transcripts,
10259            ".",
10260            "String",
10261            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10262            "unknown",
10263            "unknown",
10264            code_type_map["String"],
10265        )
10266
10267        # Mandatory fields
10268        pz_mandatory_fields_list = [
10269            "Score",
10270            "Flag",
10271            "Tags",
10272            "Comment",
10273            "Infos",
10274            "Class",
10275        ]
10276        pz_mandatory_fields = []
10277        for pz_mandatory_field in pz_mandatory_fields_list:
10278            pz_mandatory_fields.append(
10279                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10280            )
10281
10282        # PZ fields in param
10283        for pz_field in pz_param.get("pzfields", []):
10284            if pz_field in pz_mandatory_fields_list:
10285                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10286                    pz_param.get("pzprefix", "PTZ") + pz_field
10287                )
10288            else:
10289                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10290                pz_param_pzfields[pz_field] = pz_field_new
10291
10292                # Add PZ Transcript in header
10293                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10294                    pz_field_new,
10295                    ".",
10296                    "String",
10297                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10298                    "unknown",
10299                    "unknown",
10300                    code_type_map["String"],
10301                )
10302
10303        # PZ fields param
10304        pz_param["pzfields"] = pz_mandatory_fields
10305
10306        # Prioritization
10307        prioritization_result = self.prioritization(
10308            table=transcripts_table,
10309            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10310        )
10311        if not prioritization_result:
10312            log.warning("Transcripts prioritization not processed")
10313            return False
10314
10315        # PZ fields sql query
10316        query_update_select_list = []
10317        query_update_concat_list = []
10318        query_update_order_list = []
10319        for pz_param_pzfield in set(
10320            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10321        ):
10322            query_update_select_list.append(f" {pz_param_pzfield}, ")
10323
10324        for pz_param_pzfield in pz_param_pzfields:
10325            query_update_concat_list.append(
10326                f"""
10327                    , CASE 
10328                        WHEN {pz_param_pzfield} IS NOT NULL
10329                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10330                        ELSE ''
10331                    END
10332                """
10333            )
10334
10335        # Order by
10336        pz_orders = (
10337            param.get("transcripts", {})
10338            .get("prioritization", {})
10339            .get("prioritization_transcripts_order", {})
10340        )
10341        if not pz_orders:
10342            pz_orders = {
10343                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10344                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10345            }
10346        for pz_order in pz_orders:
10347            query_update_order_list.append(
10348                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10349            )
10350
10351        # Fields to explode
10352        fields_to_explode = (
10353            list(pz_param_pzfields.keys())
10354            + pz_mandatory_fields
10355            + list(pz_orders.keys())
10356        )
10357        # Remove transcript column as a specific transcript column
10358        if "transcript" in fields_to_explode:
10359            fields_to_explode.remove("transcript")
10360
10361        # Fields intranscripts table
10362        query_transcripts_table = f"""
10363            DESCRIBE SELECT * FROM {transcripts_table}
10364        """
10365        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10366
10367        # Check fields to explode
10368        for field_to_explode in fields_to_explode:
10369            if field_to_explode not in self.get_header_infos_list() + list(
10370                query_transcripts_table.column_name
10371            ):
10372                msg_err = f"INFO/{field_to_explode} NOT IN header"
10373                log.error(msg_err)
10374                raise ValueError(msg_err)
10375
10376        # Explode fields to explode
10377        self.explode_infos(
10378            table=transcripts_table,
10379            fields=fields_to_explode,
10380        )
10381
10382        # Transcript preference file
10383        transcripts_preference_file = (
10384            param.get("transcripts", {})
10385            .get("prioritization", {})
10386            .get("prioritization_transcripts", {})
10387        )
10388        transcripts_preference_file = full_path(transcripts_preference_file)
10389
10390        # Transcript preference forced
10391        transcript_preference_force = (
10392            param.get("transcripts", {})
10393            .get("prioritization", {})
10394            .get("prioritization_transcripts_force", False)
10395        )
10396        # Transcript version forced
10397        transcript_version_force = (
10398            param.get("transcripts", {})
10399            .get("prioritization", {})
10400            .get("prioritization_transcripts_version_force", False)
10401        )
10402
10403        # Transcripts Ranking
10404        if transcripts_preference_file:
10405
10406            # Transcripts file to dataframe
10407            if os.path.exists(transcripts_preference_file):
10408                transcripts_preference_dataframe = transcripts_file_to_df(
10409                    transcripts_preference_file
10410                )
10411            else:
10412                log.error(
10413                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10414                )
10415                raise ValueError(
10416                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10417                )
10418
10419            # Order by depending to transcript preference forcing
10420            if transcript_preference_force:
10421                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10422            else:
10423                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10424
10425            # Transcript columns joined depend on version consideration
10426            if transcript_version_force:
10427                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10428            else:
10429                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10430
10431            # Query ranking for update
10432            query_update_ranking = f"""
10433                SELECT
10434                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10435                    ROW_NUMBER() OVER (
10436                        PARTITION BY "#CHROM", POS, REF, ALT
10437                        ORDER BY {order_by}
10438                    ) AS rn
10439                FROM {transcripts_table}
10440                LEFT JOIN 
10441                    (
10442                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10443                        FROM transcripts_preference_dataframe
10444                    ) AS transcripts_preference
10445                ON {transcripts_version_join}
10446            """
10447
10448        else:
10449
10450            # Query ranking for update
10451            query_update_ranking = f"""
10452                SELECT
10453                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10454                    ROW_NUMBER() OVER (
10455                        PARTITION BY "#CHROM", POS, REF, ALT
10456                        ORDER BY {" , ".join(query_update_order_list)}
10457                    ) AS rn
10458                FROM {transcripts_table}
10459            """
10460
10461        # Export Transcripts prioritization infos to variants table
10462        query_update = f"""
10463            WITH RankedTranscripts AS (
10464                {query_update_ranking}
10465            )
10466            UPDATE {table_variants}
10467                SET
10468                INFO = CONCAT(CASE
10469                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10470                            THEN ''
10471                            ELSE concat("INFO", ';')
10472                        END,
10473                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10474                        )
10475            FROM
10476                RankedTranscripts
10477            WHERE
10478                rn = 1
10479                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10480                AND variants."POS" = RankedTranscripts."POS"
10481                AND variants."REF" = RankedTranscripts."REF"
10482                AND variants."ALT" = RankedTranscripts."ALT"     
10483        """
10484
10485        # log.debug(f"query_update={query_update}")
10486        self.execute_query(query=query_update)
10487
10488        # Return
10489        return True
10490
10491    def create_transcript_view_from_columns_map(
10492        self,
10493        transcripts_table: str = "transcripts",
10494        columns_maps: dict = {},
10495        added_columns: list = [],
10496        temporary_tables: list = None,
10497        annotation_fields: list = None,
10498        column_rename: dict = {},
10499        column_clean: bool = False,
10500        column_case: str = None,
10501    ) -> tuple[list, list, list]:
10502        """
10503        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10504        specified columns mapping for transcripts data.
10505
10506        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10507        of the table where the transcripts data is stored or will be stored in the database. This table
10508        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10509        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10510        :type transcripts_table: str (optional)
10511        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10512        about how to map columns from a transcripts table to create a view. Each entry in the
10513        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10514        typically includes details such as the main transcript column and additional information columns
10515        :type columns_maps: dict
10516        :param added_columns: The `added_columns` parameter in the
10517        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10518        that will be added to the view being created based on the columns map provided. These columns
10519        are generated by exploding the transcript information columns along with the main transcript
10520        column
10521        :type added_columns: list
10522        :param temporary_tables: The `temporary_tables` parameter in the
10523        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10524        tables created during the process of creating a transcript view from a columns map. These
10525        temporary tables are used to store intermediate results or transformations before the final view
10526        is generated
10527        :type temporary_tables: list
10528        :param annotation_fields: The `annotation_fields` parameter in the
10529        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10530        used for annotation in the query view creation process. These fields are extracted from the
10531        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10532        :type annotation_fields: list
10533        :param column_rename: The `column_rename` parameter in the
10534        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10535        custom renaming for columns during the creation of the temporary table view. This parameter
10536        provides a mapping of original column names to the desired renamed column names. By using this
10537        parameter,
10538        :type column_rename: dict
10539        :param column_clean: The `column_clean` parameter in the
10540        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10541        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10542        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10543        False
10544        :type column_clean: bool (optional)
10545        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10546        function is used to specify the case transformation to be applied to the columns during the view
10547        creation process. It allows you to control whether the column values should be converted to
10548        lowercase, uppercase, or remain unchanged
10549        :type column_case: str
10550        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10551        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10552        """
10553
10554        log.debug("Start transcrpts view creation from columns map...")
10555
10556        # "from_columns_map": [
10557        #     {
10558        #         "transcripts_column": "Ensembl_transcriptid",
10559        #         "transcripts_infos_columns": [
10560        #             "genename",
10561        #             "Ensembl_geneid",
10562        #             "LIST_S2_score",
10563        #             "LIST_S2_pred",
10564        #         ],
10565        #     },
10566        #     {
10567        #         "transcripts_column": "Ensembl_transcriptid",
10568        #         "transcripts_infos_columns": [
10569        #             "genename",
10570        #             "VARITY_R_score",
10571        #             "Aloft_pred",
10572        #         ],
10573        #     },
10574        # ],
10575
10576        # Init
10577        if temporary_tables is None:
10578            temporary_tables = []
10579        if annotation_fields is None:
10580            annotation_fields = []
10581
10582        # Variants table
10583        table_variants = self.get_table_variants()
10584
10585        for columns_map in columns_maps:
10586
10587            # Transcript column
10588            transcripts_column = columns_map.get("transcripts_column", None)
10589
10590            # Transcripts infos columns
10591            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10592
10593            # Transcripts infos columns rename
10594            column_rename = columns_map.get("column_rename", column_rename)
10595
10596            # Transcripts infos columns clean
10597            column_clean = columns_map.get("column_clean", column_clean)
10598
10599            # Transcripts infos columns case
10600            column_case = columns_map.get("column_case", column_case)
10601
10602            if transcripts_column is not None:
10603
10604                # Explode
10605                added_columns += self.explode_infos(
10606                    fields=[transcripts_column] + transcripts_infos_columns
10607                )
10608
10609                # View clauses
10610                clause_select_variants = []
10611                clause_select_tanscripts = []
10612                for field in [transcripts_column] + transcripts_infos_columns:
10613
10614                    # AS field
10615                    as_field = field
10616
10617                    # Rename
10618                    if column_rename:
10619                        as_field = column_rename.get(as_field, as_field)
10620
10621                    # Clean
10622                    if column_clean:
10623                        as_field = clean_annotation_field(as_field)
10624
10625                    # Case
10626                    if column_case:
10627                        if column_case.lower() in ["lower"]:
10628                            as_field = as_field.lower()
10629                        elif column_case.lower() in ["upper"]:
10630                            as_field = as_field.upper()
10631
10632                    # Clause select Variants
10633                    clause_select_variants.append(
10634                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10635                    )
10636
10637                    if field in [transcripts_column]:
10638                        clause_select_tanscripts.append(
10639                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10640                        )
10641                    else:
10642                        clause_select_tanscripts.append(
10643                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10644                        )
10645                        annotation_fields.append(as_field)
10646
10647                # Querey View
10648                query = f""" 
10649                    SELECT
10650                        "#CHROM", POS, REF, ALT, INFO,
10651                        "{transcripts_column}" AS 'transcript',
10652                        {", ".join(clause_select_tanscripts)}
10653                    FROM (
10654                        SELECT 
10655                            "#CHROM", POS, REF, ALT, INFO,
10656                            {", ".join(clause_select_variants)}
10657                        FROM {table_variants}
10658                        )
10659                    WHERE "{transcripts_column}" IS NOT NULL
10660                """
10661
10662                # Create temporary table
10663                temporary_table = transcripts_table + "".join(
10664                    random.choices(string.ascii_uppercase + string.digits, k=10)
10665                )
10666
10667                # Temporary_tables
10668                temporary_tables.append(temporary_table)
10669                query_view = f"""
10670                    CREATE TEMPORARY TABLE {temporary_table}
10671                    AS ({query})
10672                """
10673                self.execute_query(query=query_view)
10674
10675        return added_columns, temporary_tables, annotation_fields
10676
10677    def create_transcript_view_from_column_format(
10678        self,
10679        transcripts_table: str = "transcripts",
10680        column_formats: dict = {},
10681        temporary_tables: list = None,
10682        annotation_fields: list = None,
10683        column_rename: dict = {},
10684        column_clean: bool = False,
10685        column_case: str = None,
10686    ) -> tuple[list, list, list]:
10687        """
10688        The `create_transcript_view_from_column_format` function generates a transcript view based on
10689        specified column formats, adds additional columns and annotation fields, and returns the list of
10690        temporary tables and annotation fields.
10691
10692        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10693        of the table containing the transcripts data. This table will be used as the base table for
10694        creating the transcript view. The default value for this parameter is "transcripts", but you can
10695        provide a different table name if needed, defaults to transcripts
10696        :type transcripts_table: str (optional)
10697        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10698        about the columns to be used for creating the transcript view. Each entry in the dictionary
10699        specifies the mapping between a transcripts column and a transcripts infos column. This
10700        parameter allows you to define how the columns from the transcripts table should be transformed
10701        or mapped
10702        :type column_formats: dict
10703        :param temporary_tables: The `temporary_tables` parameter in the
10704        `create_transcript_view_from_column_format` function is a list that stores the names of
10705        temporary views created during the process of creating a transcript view from a column format.
10706        These temporary views are used to manipulate and extract data before generating the final
10707        transcript view
10708        :type temporary_tables: list
10709        :param annotation_fields: The `annotation_fields` parameter in the
10710        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10711        that are extracted from the temporary views created during the process. These annotation fields
10712        are obtained by querying the temporary views and extracting the column names excluding specific
10713        columns like `#CH
10714        :type annotation_fields: list
10715        :param column_rename: The `column_rename` parameter in the
10716        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10717        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10718        column names to new column names in this dictionary, you can rename specific columns during the
10719        process
10720        :type column_rename: dict
10721        :param column_clean: The `column_clean` parameter in the
10722        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10723        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10724        will be cleaned during the creation of the transcript view based on the specified column format,
10725        defaults to False
10726        :type column_clean: bool (optional)
10727        :param column_case: The `column_case` parameter in the
10728        `create_transcript_view_from_column_format` function is used to specify the case transformation
10729        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10730        to convert the column names to uppercase or lowercase, respectively
10731        :type column_case: str
10732        :return: The `create_transcript_view_from_column_format` function returns two lists:
10733        `temporary_tables` and `annotation_fields`.
10734        """
10735
10736        log.debug("Start transcrpts view creation from column format...")
10737
10738        #  "from_column_format": [
10739        #     {
10740        #         "transcripts_column": "ANN",
10741        #         "transcripts_infos_column": "Feature_ID",
10742        #     }
10743        # ],
10744
10745        # Init
10746        if temporary_tables is None:
10747            temporary_tables = []
10748        if annotation_fields is None:
10749            annotation_fields = []
10750
10751        for column_format in column_formats:
10752
10753            # annotation field and transcript annotation field
10754            annotation_field = column_format.get("transcripts_column", "ANN")
10755            transcript_annotation = column_format.get(
10756                "transcripts_infos_column", "Feature_ID"
10757            )
10758
10759            # Transcripts infos columns rename
10760            column_rename = column_format.get("column_rename", column_rename)
10761
10762            # Transcripts infos columns clean
10763            column_clean = column_format.get("column_clean", column_clean)
10764
10765            # Transcripts infos columns case
10766            column_case = column_format.get("column_case", column_case)
10767
10768            # Temporary View name
10769            temporary_view_name = transcripts_table + "".join(
10770                random.choices(string.ascii_uppercase + string.digits, k=10)
10771            )
10772
10773            # Create temporary view name
10774            temporary_view_name = self.annotation_format_to_table(
10775                uniquify=True,
10776                annotation_field=annotation_field,
10777                view_name=temporary_view_name,
10778                annotation_id=transcript_annotation,
10779                column_rename=column_rename,
10780                column_clean=column_clean,
10781                column_case=column_case,
10782            )
10783
10784            # Annotation fields
10785            if temporary_view_name:
10786                query_annotation_fields = f"""
10787                    SELECT *
10788                    FROM (
10789                        DESCRIBE SELECT *
10790                        FROM {temporary_view_name}
10791                        )
10792                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10793                """
10794                df_annotation_fields = self.get_query_to_df(
10795                    query=query_annotation_fields
10796                )
10797
10798                # Add temporary view and annotation fields
10799                temporary_tables.append(temporary_view_name)
10800                annotation_fields += list(set(df_annotation_fields["column_name"]))
10801
10802        return temporary_tables, annotation_fields
10803
10804    def create_transcript_view(
10805        self,
10806        transcripts_table: str = None,
10807        transcripts_table_drop: bool = False,
10808        param: dict = {},
10809    ) -> str:
10810        """
10811        The `create_transcript_view` function generates a transcript view by processing data from a
10812        specified table based on provided parameters and structural information.
10813
10814        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10815        is used to specify the name of the table that will store the final transcript view data. If a table
10816        name is not provided, the function will create a new table to store the transcript view data, and by
10817        default,, defaults to transcripts
10818        :type transcripts_table: str (optional)
10819        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10820        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10821        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10822        the function will drop the existing transcripts table if it exists, defaults to False
10823        :type transcripts_table_drop: bool (optional)
10824        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10825        contains information needed to create a transcript view. It includes details such as the structure
10826        of the transcripts, columns mapping, column formats, and other necessary information for generating
10827        the view. This parameter allows for flexibility and customization
10828        :type param: dict
10829        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10830        created or modified during the execution of the function.
10831        """
10832
10833        log.debug("Start transcripts view creation...")
10834
10835        # Default
10836        transcripts_table_default = "transcripts"
10837
10838        # Param
10839        if not param:
10840            param = self.get_param()
10841
10842        # Struct
10843        struct = param.get("transcripts", {}).get("struct", None)
10844
10845        # Transcript veresion
10846        transcript_id_remove_version = param.get("transcripts", {}).get(
10847            "transcript_id_remove_version", False
10848        )
10849
10850        # Transcripts mapping
10851        transcript_id_mapping_file = param.get("transcripts", {}).get(
10852            "transcript_id_mapping_file", None
10853        )
10854
10855        # Transcripts mapping
10856        transcript_id_mapping_force = param.get("transcripts", {}).get(
10857            "transcript_id_mapping_force", None
10858        )
10859
10860        if struct:
10861
10862            # Transcripts table
10863            if transcripts_table is None:
10864                transcripts_table = param.get("transcripts", {}).get(
10865                    "table", transcripts_table_default
10866                )
10867
10868            # added_columns
10869            added_columns = []
10870
10871            # Temporary tables
10872            temporary_tables = []
10873
10874            # Annotation fields
10875            annotation_fields = []
10876
10877            # from columns map
10878            columns_maps = struct.get("from_columns_map", [])
10879            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10880                self.create_transcript_view_from_columns_map(
10881                    transcripts_table=transcripts_table,
10882                    columns_maps=columns_maps,
10883                    added_columns=added_columns,
10884                    temporary_tables=temporary_tables,
10885                    annotation_fields=annotation_fields,
10886                )
10887            )
10888            added_columns += added_columns_tmp
10889            temporary_tables += temporary_tables_tmp
10890            annotation_fields += annotation_fields_tmp
10891
10892            # from column format
10893            column_formats = struct.get("from_column_format", [])
10894            temporary_tables_tmp, annotation_fields_tmp = (
10895                self.create_transcript_view_from_column_format(
10896                    transcripts_table=transcripts_table,
10897                    column_formats=column_formats,
10898                    temporary_tables=temporary_tables,
10899                    annotation_fields=annotation_fields,
10900                )
10901            )
10902            temporary_tables += temporary_tables_tmp
10903            annotation_fields += annotation_fields_tmp
10904
10905            # Remove some specific fields/column
10906            annotation_fields = list(set(annotation_fields))
10907            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10908                if field in annotation_fields:
10909                    annotation_fields.remove(field)
10910
10911            # Merge temporary tables query
10912            query_merge = ""
10913            for temporary_table in list(set(temporary_tables)):
10914
10915                # First temporary table
10916                if not query_merge:
10917                    query_merge = f"""
10918                        SELECT * FROM {temporary_table}
10919                    """
10920                # other temporary table (using UNION)
10921                else:
10922                    query_merge += f"""
10923                        UNION BY NAME SELECT * FROM {temporary_table}
10924                    """
10925
10926            # transcript table tmp
10927            transcript_table_tmp = "transcripts_tmp"
10928            transcript_table_tmp2 = "transcripts_tmp2"
10929            transcript_table_tmp3 = "transcripts_tmp3"
10930
10931            # Merge on transcript
10932            query_merge_on_transcripts_annotation_fields = []
10933
10934            # Add transcript list
10935            query_merge_on_transcripts_annotation_fields.append(
10936                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10937            )
10938
10939            # Aggregate all annotations fields
10940            for annotation_field in set(annotation_fields):
10941                query_merge_on_transcripts_annotation_fields.append(
10942                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10943                )
10944
10945            # Transcripts mapping
10946            if transcript_id_mapping_file:
10947
10948                # Transcript dataframe
10949                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10950                transcript_id_mapping_dataframe = transcripts_file_to_df(
10951                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10952                )
10953
10954                # Transcript version remove
10955                if transcript_id_remove_version:
10956                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10957                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10958                    query_left_join = f"""
10959                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10960                    """
10961                else:
10962                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10963                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10964                    query_left_join = f"""
10965                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10966                    """
10967
10968                # Transcript column for group by merge
10969                query_transcript_merge_group_by = """
10970                        CASE
10971                            WHEN transcript_mapped NOT IN ('')
10972                            THEN split_part(transcript_mapped, '.', 1)
10973                            ELSE split_part(transcript_original, '.', 1)
10974                        END
10975                    """
10976
10977                # Merge query
10978                transcripts_tmp2_query = f"""
10979                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10980                    FROM ({query_merge}) AS {transcript_table_tmp}
10981                    {query_left_join}
10982                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10983                """
10984
10985                # Retrive columns after mege
10986                transcripts_tmp2_describe_query = f"""
10987                    DESCRIBE {transcripts_tmp2_query}
10988                """
10989                transcripts_tmp2_describe_list = list(
10990                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10991                        "column_name"
10992                    ]
10993                )
10994
10995                # Create list of columns for select clause
10996                transcripts_tmp2_describe_select_clause = []
10997                for field in transcripts_tmp2_describe_list:
10998                    if field not in [
10999                        "#CHROM",
11000                        "POS",
11001                        "REF",
11002                        "ALT",
11003                        "INFO",
11004                        "transcript_mapped",
11005                    ]:
11006                        as_field = field
11007                        if field in ["transcript_original"]:
11008                            as_field = "transcripts_mapped"
11009                        transcripts_tmp2_describe_select_clause.append(
11010                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11011                        )
11012
11013                # Merge with mapping
11014                query_merge_on_transcripts = f"""
11015                    SELECT
11016                        "#CHROM", POS, REF, ALT, INFO,
11017                        CASE
11018                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11019                            THEN ANY_VALUE(transcript_mapped)
11020                            ELSE ANY_VALUE(transcript_original)
11021                        END AS transcript,
11022                        {", ".join(transcripts_tmp2_describe_select_clause)}
11023                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11024                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11025                        {query_transcript_merge_group_by}
11026                """
11027
11028                # Add transcript filter from mapping file
11029                if transcript_id_mapping_force:
11030                    query_merge_on_transcripts = f"""
11031                        SELECT *
11032                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11033                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11034                    """
11035
11036            # No transcript mapping
11037            else:
11038
11039                # Remove transcript version
11040                if transcript_id_remove_version:
11041                    query_transcript_column = f"""
11042                        split_part({transcript_table_tmp}.transcript, '.', 1)
11043                    """
11044                else:
11045                    query_transcript_column = """
11046                        transcript
11047                    """
11048
11049                # Query sections
11050                query_transcript_column_select = (
11051                    f"{query_transcript_column} AS transcript"
11052                )
11053                query_transcript_column_group_by = query_transcript_column
11054
11055                # Query for transcripts view
11056                query_merge_on_transcripts = f"""
11057                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11058                    FROM ({query_merge}) AS {transcript_table_tmp}
11059                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11060                """
11061
11062            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11063
11064            # Drop transcript view is necessary
11065            if transcripts_table_drop:
11066                query_drop = f"""
11067                    DROP TABLE IF EXISTS {transcripts_table};
11068                """
11069                self.execute_query(query=query_drop)
11070
11071            # Merge and create transcript view
11072            query_create_view = f"""
11073                CREATE TABLE IF NOT EXISTS {transcripts_table}
11074                AS {query_merge_on_transcripts}
11075            """
11076            self.execute_query(query=query_create_view)
11077
11078            # Remove added columns
11079            for added_column in added_columns:
11080                self.drop_column(column=added_column)
11081
11082        else:
11083
11084            transcripts_table = None
11085
11086        return transcripts_table
11087
11088    def annotation_format_to_table(
11089        self,
11090        uniquify: bool = True,
11091        annotation_field: str = "ANN",
11092        annotation_id: str = "Feature_ID",
11093        view_name: str = "transcripts",
11094        column_rename: dict = {},
11095        column_clean: bool = False,
11096        column_case: str = None,
11097    ) -> str:
11098        """
11099        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11100        structured table format, ensuring unique values and creating a temporary table for further
11101        processing or analysis.
11102
11103        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11104        unique values in the output or not. If set to `True`, the function will make sure that the
11105        output values are unique, defaults to True
11106        :type uniquify: bool (optional)
11107        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11108        that contains the annotation information for each variant. This field is used to extract the
11109        annotation details for further processing in the function. By default, it is set to "ANN",
11110        defaults to ANN
11111        :type annotation_field: str (optional)
11112        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11113        is used to specify the identifier for the annotation feature. This identifier will be used as a
11114        column name in the resulting table or view that is created based on the annotation data. It
11115        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11116        :type annotation_id: str (optional)
11117        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11118        to specify the name of the temporary table that will be created to store the transformed
11119        annotation data. This table will hold the extracted information from the annotation field in a
11120        structured format for further processing or analysis. By default,, defaults to transcripts
11121        :type view_name: str (optional)
11122        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11123        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11124        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11125        created based on the annotation data. This feature enables
11126        :type column_rename: dict
11127        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11128        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11129        If set to `True`, the function will clean the annotation field before further processing. This
11130        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11131        to False
11132        :type column_clean: bool (optional)
11133        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11134        used to specify the case transformation to be applied to the column names extracted from the
11135        annotation data. It allows you to set the case of the column names to either lowercase or
11136        uppercase for consistency or other specific requirements during the conversion
11137        :type column_case: str
11138        :return: The function `annotation_format_to_table` is returning the name of the view created,
11139        which is stored in the variable `view_name`.
11140        """
11141
11142        # Annotation field
11143        annotation_format = "annotation_explode"
11144
11145        # Transcript annotation
11146        if column_rename:
11147            annotation_id = column_rename.get(annotation_id, annotation_id)
11148
11149        if column_clean:
11150            annotation_id = clean_annotation_field(annotation_id)
11151
11152        # Prefix
11153        prefix = self.get_explode_infos_prefix()
11154        if prefix:
11155            prefix = "INFO/"
11156
11157        # Annotation fields
11158        annotation_infos = prefix + annotation_field
11159        annotation_format_infos = prefix + annotation_format
11160
11161        # Variants table
11162        table_variants = self.get_table_variants()
11163
11164        # Header
11165        vcf_reader = self.get_header()
11166
11167        # Add columns
11168        added_columns = []
11169
11170        # Explode HGVS field in column
11171        added_columns += self.explode_infos(fields=[annotation_field])
11172
11173        if annotation_field in vcf_reader.infos:
11174
11175            # Extract ANN header
11176            ann_description = vcf_reader.infos[annotation_field].desc
11177            pattern = r"'(.+?)'"
11178            match = re.search(pattern, ann_description)
11179            if match:
11180                ann_header_match = match.group(1).split(" | ")
11181                ann_header = []
11182                ann_header_desc = {}
11183                for i in range(len(ann_header_match)):
11184                    ann_header_info = "".join(
11185                        char for char in ann_header_match[i] if char.isalnum()
11186                    )
11187                    ann_header.append(ann_header_info)
11188                    ann_header_desc[ann_header_info] = ann_header_match[i]
11189                if not ann_header_desc:
11190                    raise ValueError("Invalid header description format")
11191            else:
11192                raise ValueError("Invalid header description format")
11193
11194            # Create variant id
11195            variant_id_column = self.get_variant_id_column()
11196            added_columns += [variant_id_column]
11197
11198            # Create dataframe
11199            dataframe_annotation_format = self.get_query_to_df(
11200                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
11201            )
11202
11203            # Create annotation columns
11204            dataframe_annotation_format[
11205                annotation_format_infos
11206            ] = dataframe_annotation_format[annotation_infos].apply(
11207                lambda x: explode_annotation_format(
11208                    annotation=str(x),
11209                    uniquify=uniquify,
11210                    output_format="JSON",
11211                    prefix="",
11212                    header=list(ann_header_desc.values()),
11213                )
11214            )
11215
11216            # Find keys
11217            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
11218            df_keys = self.get_query_to_df(query=query_json)
11219
11220            # Check keys
11221            query_json_key = []
11222            for _, row in df_keys.iterrows():
11223
11224                # Key
11225                key = row.iloc[0]
11226                key_clean = key
11227
11228                # key rename
11229                if column_rename:
11230                    key_clean = column_rename.get(key_clean, key_clean)
11231
11232                # key clean
11233                if column_clean:
11234                    key_clean = clean_annotation_field(key_clean)
11235
11236                # Key case
11237                if column_case:
11238                    if column_case.lower() in ["lower"]:
11239                        key_clean = key_clean.lower()
11240                    elif column_case.lower() in ["upper"]:
11241                        key_clean = key_clean.upper()
11242
11243                # Type
11244                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
11245
11246                # Get DataFrame from query
11247                df_json_type = self.get_query_to_df(query=query_json_type)
11248
11249                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
11250                with pd.option_context("future.no_silent_downcasting", True):
11251                    df_json_type.fillna(value="", inplace=True)
11252                    replace_dict = {None: np.nan, "": np.nan}
11253                    df_json_type.replace(replace_dict, inplace=True)
11254                    df_json_type.dropna(inplace=True)
11255
11256                # Detect column type
11257                column_type = detect_column_type(df_json_type[key_clean])
11258
11259                # Append
11260                query_json_key.append(
11261                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11262                )
11263
11264            # Create view
11265            query_view = f"""
11266                CREATE TEMPORARY TABLE {view_name}
11267                AS (
11268                    SELECT *, {annotation_id} AS 'transcript'
11269                    FROM (
11270                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11271                        FROM dataframe_annotation_format
11272                        )
11273                    );
11274            """
11275            self.execute_query(query=query_view)
11276
11277        else:
11278
11279            # Return None
11280            view_name = None
11281
11282        # Remove added columns
11283        for added_column in added_columns:
11284            self.drop_column(column=added_column)
11285
11286        return view_name
11287
11288    def transcript_view_to_variants(
11289        self,
11290        transcripts_table: str = None,
11291        transcripts_column_id: str = None,
11292        transcripts_info_json: str = None,
11293        transcripts_info_field_json: str = None,
11294        transcripts_info_format: str = None,
11295        transcripts_info_field_format: str = None,
11296        param: dict = {},
11297    ) -> bool:
11298        """
11299        The `transcript_view_to_variants` function updates a variants table with information from
11300        transcripts in JSON format.
11301
11302        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11303        table containing the transcripts data. If this parameter is not provided, the function will
11304        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11305        :type transcripts_table: str
11306        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11307        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11308        identifier is used to match transcripts with variants in the database
11309        :type transcripts_column_id: str
11310        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11311        of the column in the variants table where the transcripts information will be stored in JSON
11312        format. This parameter allows you to define the column in the variants table that will hold the
11313        JSON-formatted information about transcripts
11314        :type transcripts_info_json: str
11315        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11316        specify the field in the VCF header that will contain information about transcripts in JSON
11317        format. This field will be added to the VCF header as an INFO field with the specified name
11318        :type transcripts_info_field_json: str
11319        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11320        format of the information about transcripts that will be stored in the variants table. This
11321        format can be used to define how the transcript information will be structured or displayed
11322        within the variants table
11323        :type transcripts_info_format: str
11324        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11325        specify the field in the VCF header that will contain information about transcripts in a
11326        specific format. This field will be added to the VCF header as an INFO field with the specified
11327        name
11328        :type transcripts_info_field_format: str
11329        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11330        that contains various configuration settings related to transcripts. It is used to provide
11331        default values for certain parameters if they are not explicitly provided when calling the
11332        method. The `param` dictionary can be passed as an argument
11333        :type param: dict
11334        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11335        if the operation is successful and `False` if certain conditions are not met.
11336        """
11337
11338        msg_info_prefix = "Start transcripts view to variants annotations"
11339
11340        log.debug(f"{msg_info_prefix}...")
11341
11342        # Default
11343        transcripts_table_default = "transcripts"
11344        transcripts_column_id_default = "transcript"
11345        transcripts_info_json_default = None
11346        transcripts_info_format_default = None
11347        transcripts_info_field_json_default = None
11348        transcripts_info_field_format_default = None
11349
11350        # Param
11351        if not param:
11352            param = self.get_param()
11353
11354        # Transcripts table
11355        if transcripts_table is None:
11356            transcripts_table = param.get("transcripts", {}).get(
11357                "table", transcripts_table_default
11358            )
11359
11360        # Transcripts column ID
11361        if transcripts_column_id is None:
11362            transcripts_column_id = param.get("transcripts", {}).get(
11363                "column_id", transcripts_column_id_default
11364            )
11365
11366        # Transcripts info json
11367        if transcripts_info_json is None:
11368            transcripts_info_json = param.get("transcripts", {}).get(
11369                "transcripts_info_json", transcripts_info_json_default
11370            )
11371
11372        # Transcripts info field JSON
11373        if transcripts_info_field_json is None:
11374            transcripts_info_field_json = param.get("transcripts", {}).get(
11375                "transcripts_info_field_json", transcripts_info_field_json_default
11376            )
11377        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11378        #     transcripts_info_json = transcripts_info_field_json
11379
11380        # Transcripts info format
11381        if transcripts_info_format is None:
11382            transcripts_info_format = param.get("transcripts", {}).get(
11383                "transcripts_info_format", transcripts_info_format_default
11384            )
11385
11386        # Transcripts info field FORMAT
11387        if transcripts_info_field_format is None:
11388            transcripts_info_field_format = param.get("transcripts", {}).get(
11389                "transcripts_info_field_format", transcripts_info_field_format_default
11390            )
11391        # if (
11392        #     transcripts_info_field_format is not None
11393        #     and transcripts_info_format is None
11394        # ):
11395        #     transcripts_info_format = transcripts_info_field_format
11396
11397        # Variants table
11398        table_variants = self.get_table_variants()
11399
11400        # Check info columns param
11401        if (
11402            transcripts_info_json is None
11403            and transcripts_info_field_json is None
11404            and transcripts_info_format is None
11405            and transcripts_info_field_format is None
11406        ):
11407            return False
11408
11409        # Transcripts infos columns
11410        query_transcripts_infos_columns = f"""
11411            SELECT *
11412            FROM (
11413                DESCRIBE SELECT * FROM {transcripts_table}
11414                )
11415            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11416        """
11417        transcripts_infos_columns = list(
11418            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11419        )
11420
11421        # View results
11422        clause_select = []
11423        clause_to_json = []
11424        clause_to_format = []
11425        for field in transcripts_infos_columns:
11426            # Do not consider INFO field for export into fields
11427            if field not in ["INFO"]:
11428                clause_select.append(
11429                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11430                )
11431                clause_to_json.append(f""" '{field}': "{field}" """)
11432                clause_to_format.append(f""" "{field}" """)
11433
11434        # Update
11435        update_set_json = []
11436        update_set_format = []
11437
11438        # VCF header
11439        vcf_reader = self.get_header()
11440
11441        # Transcripts to info column in JSON
11442        if transcripts_info_json:
11443
11444            # Create column on variants table
11445            self.add_column(
11446                table_name=table_variants,
11447                column_name=transcripts_info_json,
11448                column_type="JSON",
11449                default_value=None,
11450                drop=False,
11451            )
11452
11453            # Add header
11454            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11455                transcripts_info_json,
11456                ".",
11457                "String",
11458                "Transcripts in JSON format",
11459                "unknwon",
11460                "unknwon",
11461                self.code_type_map["String"],
11462            )
11463
11464            # Add to update
11465            update_set_json.append(
11466                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11467            )
11468
11469        # Transcripts to info field in JSON
11470        if transcripts_info_field_json:
11471
11472            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11473
11474            # Add to update
11475            update_set_json.append(
11476                f""" 
11477                    INFO = concat(
11478                            CASE
11479                                WHEN INFO NOT IN ('', '.')
11480                                THEN INFO
11481                                ELSE ''
11482                            END,
11483                            CASE
11484                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11485                                THEN concat(
11486                                    ';{transcripts_info_field_json}=',
11487                                    t.{transcripts_info_json}
11488                                )
11489                                ELSE ''
11490                            END
11491                            )
11492                """
11493            )
11494
11495            # Add header
11496            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11497                transcripts_info_field_json,
11498                ".",
11499                "String",
11500                "Transcripts in JSON format",
11501                "unknwon",
11502                "unknwon",
11503                self.code_type_map["String"],
11504            )
11505
11506        if update_set_json:
11507
11508            # Update query
11509            query_update = f"""
11510                UPDATE {table_variants}
11511                    SET {", ".join(update_set_json)}
11512                FROM
11513                (
11514                    SELECT
11515                        "#CHROM", POS, REF, ALT,
11516                            concat(
11517                            '{{',
11518                            string_agg(
11519                                '"' || "{transcripts_column_id}" || '":' ||
11520                                to_json(json_output)
11521                            ),
11522                            '}}'
11523                            )::JSON AS {transcripts_info_json}
11524                    FROM
11525                        (
11526                        SELECT
11527                            "#CHROM", POS, REF, ALT,
11528                            "{transcripts_column_id}",
11529                            to_json(
11530                                {{{",".join(clause_to_json)}}}
11531                            )::JSON AS json_output
11532                        FROM
11533                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11534                        WHERE "{transcripts_column_id}" IS NOT NULL
11535                        )
11536                    GROUP BY "#CHROM", POS, REF, ALT
11537                ) AS t
11538                WHERE {table_variants}."#CHROM" = t."#CHROM"
11539                    AND {table_variants}."POS" = t."POS"
11540                    AND {table_variants}."REF" = t."REF"
11541                    AND {table_variants}."ALT" = t."ALT"
11542            """
11543
11544            self.execute_query(query=query_update)
11545
11546        # Transcripts to info column in FORMAT
11547        if transcripts_info_format:
11548
11549            # Create column on variants table
11550            self.add_column(
11551                table_name=table_variants,
11552                column_name=transcripts_info_format,
11553                column_type="VARCHAR",
11554                default_value=None,
11555                drop=False,
11556            )
11557
11558            # Add header
11559            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11560                transcripts_info_format,
11561                ".",
11562                "String",
11563                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11564                "unknwon",
11565                "unknwon",
11566                self.code_type_map["String"],
11567            )
11568
11569            # Add to update
11570            update_set_format.append(
11571                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11572            )
11573
11574        else:
11575
11576            # Set variable for internal queries
11577            transcripts_info_format = "transcripts_info_format"
11578
11579        # Transcripts to info field in JSON
11580        if transcripts_info_field_format:
11581
11582            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11583
11584            # Add to update
11585            update_set_format.append(
11586                f""" 
11587                    INFO = concat(
11588                            CASE
11589                                WHEN INFO NOT IN ('', '.')
11590                                THEN INFO
11591                                ELSE ''
11592                            END,
11593                            CASE
11594                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11595                                THEN concat(
11596                                    ';{transcripts_info_field_format}=',
11597                                    t.{transcripts_info_format}
11598                                )
11599                                ELSE ''
11600                            END
11601                            )
11602                """
11603            )
11604
11605            # Add header
11606            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11607                transcripts_info_field_format,
11608                ".",
11609                "String",
11610                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11611                "unknwon",
11612                "unknwon",
11613                self.code_type_map["String"],
11614            )
11615
11616        if update_set_format:
11617
11618            # Update query
11619            query_update = f"""
11620                UPDATE {table_variants}
11621                    SET {", ".join(update_set_format)}
11622                FROM
11623                (
11624                    SELECT
11625                        "#CHROM", POS, REF, ALT,
11626                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11627                    FROM 
11628                        (
11629                        SELECT
11630                            "#CHROM", POS, REF, ALT,
11631                            "{transcripts_column_id}",
11632                            concat(
11633                                "{transcripts_column_id}",
11634                                '|',
11635                                {", '|', ".join(clause_to_format)}
11636                            ) AS {transcripts_info_format}
11637                        FROM
11638                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11639                        )
11640                    GROUP BY "#CHROM", POS, REF, ALT
11641                ) AS t
11642                WHERE {table_variants}."#CHROM" = t."#CHROM"
11643                    AND {table_variants}."POS" = t."POS"
11644                    AND {table_variants}."REF" = t."REF"
11645                    AND {table_variants}."ALT" = t."ALT"
11646            """
11647
11648            self.execute_query(query=query_update)
11649
11650        return True
11651
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        Rename or remove INFO fields, both in the VCF header and in the INFO
        column of the variants table.

        For each entry of `fields_to_rename`:
        - a value of `None` removes the field (header entry deleted, INFO
          occurrences stripped);
        - any other value renames the field (header entry recreated under the
          new name with the same metadata, INFO occurrences rewritten).

        Fields absent from the header are silently skipped. Nothing is done
        when the database is opened read-only (config "access" == "RO").

        :param fields_to_rename: mapping of original field name -> new name
            (or None to remove the field)
        :type fields_to_rename: dict
        :param table: name of the variants table to update; defaults to
            `self.get_table_variants()`
        :type table: str
        :return: mapping of each processed field name to its new name (or
            None if the field was removed)
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        if table is None:
            table = self.get_table_variants()

        # Nested regexp_replace() calls are accumulated into chains, each
        # capped at `regex_replace_partition` replacements, so a single SQL
        # expression never grows excessively deep. One UPDATE runs per chain.
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "INFO"

        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename header: recreate the entry under the new name
                    # with the original metadata, then drop the old entry.
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # Rename INFO patterns.
                    # NOTE(review): the pattern only matches 'FIELD=value'
                    # occurrences, so Flag fields (present without '=') are
                    # not rewritten in INFO; `field_to_rename` is also
                    # injected unescaped, so names containing regex
                    # metacharacters would misbehave — TODO confirm field
                    # names are always plain identifiers.
                    field_pattern = rf'(^|;)({field_to_rename})=([^;]*)'
                    if field_renamed is not None:
                        field_renamed_pattern = rf'\1{field_renamed}=\3'
                    else:
                        field_renamed_pattern = ''

                    # regexp replace: wrap the current chain in one more
                    # regexp_replace(); restart a fresh chain from INFO every
                    # `regex_replace_partition` fields. The dict entry for the
                    # current key is overwritten each time with the grown chain.
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "INFO"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
                    else:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' removed")

            # Rename INFO: one UPDATE per accumulated replacement chain
            for regex_replace_key, regex_replace  in regex_replace_dict.items():
                log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = {regex_replace}
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed
11749
11750    def calculation_rename_info_fields(
11751        self,
11752        fields_to_rename: dict = None,
11753        table: str = None,
11754        operation_name: str = "RENAME_INFO_FIELDS",
11755    ) -> None:
11756        """
11757        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11758        fields to rename and table if provided, and then calls another function to rename the fields.
11759
11760        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11761        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11762        the key and the new field name as the value
11763        :type fields_to_rename: dict
11764        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11765        specify the name of the table for which the fields are to be renamed. It is a string type
11766        parameter
11767        :type table: str
11768        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11769        method is a string that specifies the name of the operation being performed. In this context, it
11770        is used as a default value for the operation name if not explicitly provided when calling the
11771        function, defaults to RENAME_INFO_FIELDS
11772        :type operation_name: str (optional)
11773        """
11774
11775        # Param
11776        param = self.get_param()
11777
11778        # Get param fields to rename
11779        param_fields_to_rename = (
11780            param.get("calculation", {})
11781            .get("calculations", {})
11782            .get(operation_name, {})
11783            .get("fields_to_rename", None)
11784        )
11785
11786        # Get param table
11787        param_table = (
11788            param.get("calculation", {})
11789            .get("calculations", {})
11790            .get(operation_name, {})
11791            .get("table", None)
11792        )
11793
11794        # Init fields_to_rename
11795        if fields_to_rename is None:
11796            fields_to_rename = param_fields_to_rename
11797
11798        # Init table
11799        if table is None:
11800            table = param_table
11801
11802        renamed_fields = self.rename_info_fields(
11803            fields_to_rename=fields_to_rename, table=table
11804        )
11805
11806        log.debug(f"renamed_fields:{renamed_fields}")
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
39    def __init__(
40        self,
41        conn=None,
42        input: str = None,
43        output: str = None,
44        config: dict = {},
45        param: dict = {},
46        load: bool = False,
47    ) -> None:
48        """
49        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
50        header
51
52        :param conn: the connection to the database
53        :param input: the input file
54        :param output: the output file
55        :param config: a dictionary containing the configuration of the model
56        :param param: a dictionary containing the parameters of the model
57        """
58
59        # Init variables
60        self.init_variables()
61
62        # Input
63        self.set_input(input)
64
65        # Config
66        self.set_config(config)
67
68        # Param
69        self.set_param(param)
70
71        # Output
72        self.set_output(output)
73
74        # connexion
75        self.set_connexion(conn)
76
77        # Header
78        self.set_header()
79
80        # Samples
81        self.set_samples()
82
83        # Load data
84        if load:
85            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connexion and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 87    def set_samples(self, samples: list = None) -> list:
 88        """
 89        The function `set_samples` sets the samples attribute of an object to a provided list or
 90        retrieves it from a parameter dictionary.
 91
 92        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 93        input and sets the `samples` attribute of the class to the provided list. If no samples are
 94        provided, it tries to get the samples from the class's parameters using the `get_param` method
 95        :type samples: list
 96        :return: The `samples` list is being returned.
 97        """
 98
 99        if not samples:
100            samples = self.get_param().get("samples", {}).get("list", None)
101
102        self.samples = samples
103
104        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
106    def get_samples(self) -> list:
107        """
108        This function returns a list of samples.
109        :return: The `get_samples` method is returning the `samples` attribute of the object.
110        """
111
112        return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
114    def get_samples_check(self) -> bool:
115        """
116        This function returns the value of the "check" key within the "samples" dictionary retrieved
117        from the parameters.
118        :return: The method `get_samples_check` is returning the value of the key "check" inside the
119        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
120        method. If the key "check" is not found, it will return `False`.
121        """
122
123        return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it defaults to True.

def set_input(self, input: str = None) -> None:
125    def set_input(self, input: str = None) -> None:
126        """
127        The function `set_input` takes a file name as input, extracts the name and extension, and sets
128        attributes in the class accordingly.
129
130        :param input: The `set_input` method in the provided code snippet is used to set attributes
131        related to the input file. Here's a breakdown of the parameters and their usage in the method:
132        :type input: str
133        """
134
135        if input and not isinstance(input, str):
136            try:
137                self.input = input.name
138            except:
139                log.error(f"Input file '{input} in bad format")
140                raise ValueError(f"Input file '{input} in bad format")
141        else:
142            self.input = input
143
144        # Input format
145        if input:
146            input_name, input_extension = os.path.splitext(self.input)
147            self.input_name = input_name
148            self.input_extension = input_extension
149            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: Path to the input file as a string, or a file-like object whose name attribute provides the path; used to set the input attribute and to derive the input name, extension, and format attributes.
def set_config(self, config: dict) -> None:
151    def set_config(self, config: dict) -> None:
152        """
153        The set_config function takes a config object and assigns it as the configuration object for the
154        class.
155
156        :param config: The `config` parameter in the `set_config` function is a dictionary object that
157        contains configuration settings for the class. When you call the `set_config` function with a
158        dictionary object as the argument, it will set that dictionary as the configuration object for
159        the class
160        :type config: dict
161        """
162
163        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
165    def set_param(self, param: dict) -> None:
166        """
167        This function sets a parameter object for the class based on the input dictionary.
168
169        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
170        as the `param` attribute of the class instance
171        :type param: dict
172        """
173
174        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
176    def init_variables(self) -> None:
177        """
178        This function initializes the variables that will be used in the rest of the class
179        """
180
181        self.prefix = "howard"
182        self.table_variants = "variants"
183        self.dataframe = None
184
185        self.comparison_map = {
186            "gt": ">",
187            "gte": ">=",
188            "lt": "<",
189            "lte": "<=",
190            "equals": "=",
191            "contains": "SIMILAR TO",
192        }
193
194        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
195
196        self.code_type_map_to_sql = {
197            "Integer": "INTEGER",
198            "String": "VARCHAR",
199            "Float": "FLOAT",
200            "Flag": "VARCHAR",
201        }
202
203        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
205    def get_indexing(self) -> bool:
206        """
207        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
208        returns False.
209        :return: The value of the indexing parameter.
210        """
211
212        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
214    def get_connexion_config(self) -> dict:
215        """
216        The function `get_connexion_config` returns a dictionary containing the configuration for a
217        connection, including the number of threads and memory limit.
218        :return: a dictionary containing the configuration for the Connexion library.
219        """
220
221        # config
222        config = self.get_config()
223
224        # Connexion config
225        connexion_config = {}
226        threads = self.get_threads()
227
228        # Threads
229        if threads:
230            connexion_config["threads"] = threads
231
232        # Memory
233        # if config.get("memory", None):
234        #     connexion_config["memory_limit"] = config.get("memory")
235        if self.get_memory():
236            connexion_config["memory_limit"] = self.get_memory()
237
238        # Temporary directory
239        if config.get("tmp", None):
240            connexion_config["temp_directory"] = config.get("tmp")
241
242        # Access
243        if config.get("access", None):
244            access = config.get("access")
245            if access in ["RO"]:
246                access = "READ_ONLY"
247            elif access in ["RW"]:
248                access = "READ_WRITE"
249            connexion_db = self.get_connexion_db()
250            if connexion_db in ":memory:":
251                access = "READ_WRITE"
252            connexion_config["access_mode"] = access
253
254        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary of database connection options (threads, memory limit, temporary directory, access mode) derived from the configuration.

def get_duckdb_settings(self) -> dict:
256    def get_duckdb_settings(self) -> dict:
257        """
258        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
259        string.
260        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
261        """
262
263        # config
264        config = self.get_config()
265
266        # duckdb settings
267        duckdb_settings_dict = {}
268        if config.get("duckdb_settings", None):
269            duckdb_settings = config.get("duckdb_settings")
270            duckdb_settings = full_path(duckdb_settings)
271            # duckdb setting is a file
272            if os.path.exists(duckdb_settings):
273                with open(duckdb_settings) as json_file:
274                    duckdb_settings_dict = yaml.safe_load(json_file)
275            # duckdb settings is a string
276            else:
277                duckdb_settings_dict = json.loads(duckdb_settings)
278
279        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
281    def set_connexion_db(self) -> str:
282        """
283        The function `set_connexion_db` returns the appropriate database connection string based on the
284        input format and connection type.
285        :return: the value of the variable `connexion_db`.
286        """
287
288        # Default connexion db
289        default_connexion_db = ":memory:"
290
291        # Find connexion db
292        if self.get_input_format() in ["db", "duckdb"]:
293            connexion_db = self.get_input()
294        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
295            connexion_db = default_connexion_db
296        elif self.get_connexion_type() in ["tmpfile"]:
297            tmp_name = tempfile.mkdtemp(
298                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
299            )
300            connexion_db = f"{tmp_name}/tmp.db"
301        elif self.get_connexion_type() != "":
302            connexion_db = self.get_connexion_type()
303        else:
304            connexion_db = default_connexion_db
305
306        # Set connexion db
307        self.connexion_db = connexion_db
308
309        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
311    def set_connexion(self, conn) -> None:
312        """
313        The function `set_connexion` creates a connection to a database, with options for different
314        database formats and settings.
315
316        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
317        database. If a connection is not provided, a new connection to an in-memory database is created.
318        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
319        sqlite
320        """
321
322        # Connexion db
323        connexion_db = self.set_connexion_db()
324
325        # Connexion config
326        connexion_config = self.get_connexion_config()
327
328        # Connexion format
329        connexion_format = self.get_config().get("connexion_format", "duckdb")
330        # Set connexion format
331        self.connexion_format = connexion_format
332
333        # Connexion
334        if not conn:
335            if connexion_format in ["duckdb"]:
336                conn = duckdb.connect(connexion_db, config=connexion_config)
337                # duckDB settings
338                duckdb_settings = self.get_duckdb_settings()
339                if duckdb_settings:
340                    for setting in duckdb_settings:
341                        setting_value = duckdb_settings.get(setting)
342                        if isinstance(setting_value, str):
343                            setting_value = f"'{setting_value}'"
344                        conn.execute(f"PRAGMA {setting}={setting_value};")
345            elif connexion_format in ["sqlite"]:
346                conn = sqlite3.connect(connexion_db)
347
348        # Set connexion
349        self.conn = conn
350
351        # Log
352        log.debug(f"connexion_format: {connexion_format}")
353        log.debug(f"connexion_db: {connexion_db}")
354        log.debug(f"connexion config: {connexion_config}")
355        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
357    def set_output(self, output: str = None) -> None:
358        """
359        The `set_output` function in Python sets the output file based on the input or a specified key
360        in the config file, extracting the output name, extension, and format.
361
362        :param output: The `output` parameter in the `set_output` method is used to specify the name of
363        the output file. If the config file has an 'output' key, the method sets the output to the value
364        of that key. If no output is provided, it sets the output to `None`
365        :type output: str
366        """
367
368        if output and not isinstance(output, str):
369            self.output = output.name
370        else:
371            self.output = output
372
373        # Output format
374        if self.output:
375            output_name, output_extension = os.path.splitext(self.output)
376            self.output_name = output_name
377            self.output_extension = output_extension
378            self.output_format = self.output_extension.replace(".", "")
379        else:
380            self.output_name = None
381            self.output_extension = None
382            self.output_format = None

The set_output function sets the output file and derives the output name, extension, and format from it.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file; when a non-string object with a name attribute is given, its name is used. If no output is provided, the output is set to None
def set_header(self) -> None:
384    def set_header(self) -> None:
385        """
386        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
387        """
388
389        input_file = self.get_input()
390        default_header_list = [
391            "##fileformat=VCFv4.2",
392            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
393        ]
394
395        # Full path
396        input_file = full_path(input_file)
397
398        if input_file:
399
400            input_format = self.get_input_format()
401            input_compressed = self.get_input_compressed()
402            config = self.get_config()
403            header_list = default_header_list
404            if input_format in [
405                "vcf",
406                "hdr",
407                "tsv",
408                "csv",
409                "psv",
410                "parquet",
411                "db",
412                "duckdb",
413            ]:
414                # header provided in param
415                if config.get("header_file", None):
416                    with open(config.get("header_file"), "rt") as f:
417                        header_list = self.read_vcf_header(f)
418                # within a vcf file format (header within input file itsself)
419                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
420                    # within a compressed vcf file format (.vcf.gz)
421                    if input_compressed:
422                        with bgzf.open(input_file, "rt") as f:
423                            header_list = self.read_vcf_header(f)
424                    # within an uncompressed vcf file format (.vcf)
425                    else:
426                        with open(input_file, "rt") as f:
427                            header_list = self.read_vcf_header(f)
428                # header provided in default external file .hdr
429                elif os.path.exists((input_file + ".hdr")):
430                    with open(input_file + ".hdr", "rt") as f:
431                        header_list = self.read_vcf_header(f)
432                else:
433                    try:  # Try to get header info fields and file columns
434
435                        with tempfile.TemporaryDirectory() as tmpdir:
436
437                            # Create database
438                            db_for_header = Database(database=input_file)
439
440                            # Get header columns for infos fields
441                            db_header_from_columns = (
442                                db_for_header.get_header_from_columns()
443                            )
444
445                            # Get real columns in the file
446                            db_header_columns = db_for_header.get_columns()
447
448                            # Write header file
449                            header_file_tmp = os.path.join(tmpdir, "header")
450                            f = open(header_file_tmp, "w")
451                            vcf.Writer(f, db_header_from_columns)
452                            f.close()
453
454                            # Replace #CHROM line with rel columns
455                            header_list = db_for_header.read_header_file(
456                                header_file=header_file_tmp
457                            )
458                            header_list[-1] = "\t".join(db_header_columns)
459
460                    except:
461
462                        log.warning(
463                            f"No header for file {input_file}. Set as default VCF header"
464                        )
465                        header_list = default_header_list
466
467            else:  # try for unknown format ?
468
469                log.error(f"Input file format '{input_format}' not available")
470                raise ValueError(f"Input file format '{input_format}' not available")
471
472            if not header_list:
473                header_list = default_header_list
474
475            # header as list
476            self.header_list = header_list
477
478            # header as VCF object
479            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
480
481        else:
482
483            self.header_list = None
484            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
486    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
487        """
488        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
489        DataFrame based on the connection format.
490
491        :param query: The `query` parameter in the `get_query_to_df` function is a string that
492        represents the SQL query you want to execute. This query will be used to fetch data from a
493        database and convert it into a pandas DataFrame
494        :type query: str
495        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
496        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
497        function will only fetch up to that number of rows from the database query result. If no limit
498        is specified,
499        :type limit: int
500        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
501        """
502
503        # Connexion format
504        connexion_format = self.get_connexion_format()
505
506        # Limit in query
507        if limit:
508            pd.set_option("display.max_rows", limit)
509            if connexion_format in ["duckdb"]:
510                df = (
511                    self.conn.execute(query)
512                    .fetch_record_batch(limit)
513                    .read_next_batch()
514                    .to_pandas()
515                )
516            elif connexion_format in ["sqlite"]:
517                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
518
519        # Full query
520        else:
521            if connexion_format in ["duckdb"]:
522                df = self.conn.execute(query).df()
523            elif connexion_format in ["sqlite"]:
524                df = pd.read_sql_query(query, self.conn)
525
526        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is returned.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
528    def get_overview(self) -> None:
529        """
530        The function prints the input, output, config, and dataframe of the current object
531        """
532        table_variants_from = self.get_table_variants(clause="from")
533        sql_columns = self.get_header_columns_as_sql()
534        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
535        df = self.get_query_to_df(sql_query_export)
536        log.info(
537            "Input:  "
538            + str(self.get_input())
539            + " ["
540            + str(str(self.get_input_format()))
541            + "]"
542        )
543        log.info(
544            "Output: "
545            + str(self.get_output())
546            + " ["
547            + str(str(self.get_output_format()))
548            + "]"
549        )
550        log.info("Config: ")
551        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
552            "\n"
553        ):
554            log.info("\t" + str(d))
555        log.info("Param: ")
556        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
557            "\n"
558        ):
559            log.info("\t" + str(d))
560        log.info("Sample list: " + str(self.get_header_sample_list()))
561        log.info("Dataframe: ")
562        for d in str(df).split("\n"):
563            log.info("\t" + str(d))
564
565        # garbage collector
566        del df
567        gc.collect()
568
569        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
    def get_stats(self) -> dict:
        """
        Calculate and return various statistics of the current object:
        input file infos, variants (counts by chromosome, counts by type,
        substitutions), samples and genotypes, header INFO/FORMAT fields,
        and quality statistics.

        :return: a dictionary of statistics with sections "Infos",
            "Variants", "Samples" (when genotypes are available), "Header",
            and "Quality" (when a QUAL column is available)
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table
        table_variants_from = self.get_table_variants()

        # Stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT fields
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Fraction of variants per chromosome (0..1, not multiplied by 100)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check samples: genotype stats only make sense with a GT FORMAT
        # field and a FORMAT column
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes (the leading '0/1'-like token of the
                # sample column), keeping only well-formed genotypes whose
                # number of ':'-separated fields matches the FORMAT column
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts as present only if at least one genotype
                # row was found
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # Row index shared across both field types
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: translate PyVCF special codes (None/-1/-2/-3)
                # into VCF notation ('.'/'A'/'G'/'R')
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ('.' when undefined)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description ('' when undefined)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            # Exclude missing quality values ('.') before casting
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the InDel clause below, AND binds tighter than
        # OR, so the condition reads as
        # len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT))
        # — confirm this is the intended classification
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Substitution spectrum (e.g. 'A>G') for single-nucleotide variants
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
793    def stats_to_file(self, file: str = None) -> str:
794        """
795        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
796        into a JSON object, and writes the JSON object to the specified file.
797
798        :param file: The `file` parameter is a string that represents the file path where the JSON data
799        will be written
800        :type file: str
801        :return: the name of the file that was written to.
802        """
803
804        # Get stats
805        stats = self.get_stats()
806
807        # Serializing json
808        json_object = json.dumps(stats, indent=4)
809
810        # Writing to sample.json
811        with open(file, "w") as outfile:
812            outfile.write(json_object)
813
814        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
816    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
817        """
818        The `print_stats` function generates a markdown file and prints the statistics contained in a
819        JSON file in a formatted manner.
820
821        :param output_file: The `output_file` parameter is a string that specifies the path and filename
822        of the output file where the stats will be printed in Markdown format. If no `output_file` is
823        provided, a temporary directory will be created and the stats will be saved in a file named
824        "stats.md" within that
825        :type output_file: str
826        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
827        file where the statistics will be saved. If no value is provided, a temporary directory will be
828        created and a default file name "stats.json" will be used
829        :type json_file: str
830        :return: The function `print_stats` does not return any value. It has a return type annotation
831        of `None`.
832        """
833
834        # Full path
835        output_file = full_path(output_file)
836        json_file = full_path(json_file)
837
838        with tempfile.TemporaryDirectory() as tmpdir:
839
840            # Files
841            if not output_file:
842                output_file = os.path.join(tmpdir, "stats.md")
843            if not json_file:
844                json_file = os.path.join(tmpdir, "stats.json")
845
846            # Create folders
847            if not os.path.exists(os.path.dirname(output_file)):
848                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
849            if not os.path.exists(os.path.dirname(json_file)):
850                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
851
852            # Create stats JSON file
853            stats_file = self.stats_to_file(file=json_file)
854
855            # Print stats file
856            with open(stats_file) as f:
857                stats = yaml.safe_load(f)
858
859            # Output
860            output_title = []
861            output_index = []
862            output = []
863
864            # Title
865            output_title.append("# HOWARD Stats")
866
867            # Index
868            output_index.append("## Index")
869
870            # Process sections
871            for section in stats:
872                infos = stats.get(section)
873                section_link = "#" + section.lower().replace(" ", "-")
874                output.append(f"## {section}")
875                output_index.append(f"- [{section}]({section_link})")
876
877                if len(infos):
878                    for info in infos:
879                        try:
880                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
881                            is_df = True
882                        except:
883                            try:
884                                df = pd.DataFrame.from_dict(
885                                    json.loads((infos.get(info))), orient="index"
886                                )
887                                is_df = True
888                            except:
889                                is_df = False
890                        if is_df:
891                            output.append(f"### {info}")
892                            info_link = "#" + info.lower().replace(" ", "-")
893                            output_index.append(f"   - [{info}]({info_link})")
894                            output.append(f"{df.to_markdown(index=False)}")
895                        else:
896                            output.append(f"- {info}: {infos.get(info)}")
897                else:
898                    output.append(f"NA")
899
900            # Write stats in markdown file
901            with open(output_file, "w") as fp:
902                for item in output_title:
903                    fp.write("%s\n" % item)
904                for item in output_index:
905                    fp.write("%s\n" % item)
906                for item in output:
907                    fp.write("%s\n" % item)
908
909            # Output stats in markdown
910            print("")
911            print("\n\n".join(output_title))
912            print("")
913            print("\n\n".join(output))
914            print("")
915
916        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that directory.
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
918    def get_input(self) -> str:
919        """
920        It returns the value of the input variable.
921        :return: The input is being returned.
922        """
923        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
925    def get_input_format(self, input_file: str = None) -> str:
926        """
927        This function returns the format of the input variable, either from the provided input file or
928        by prompting for input.
929
930        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
931        represents the file path of the input file. If no `input_file` is provided when calling the
932        method, it will default to `None`
933        :type input_file: str
934        :return: The format of the input variable is being returned.
935        """
936
937        if not input_file:
938            input_file = self.get_input()
939        input_format = get_file_format(input_file)
940        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
942    def get_input_compressed(self, input_file: str = None) -> str:
943        """
944        The function `get_input_compressed` returns the format of the input variable after compressing
945        it.
946
947        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
948        that represents the file path of the input file. If no `input_file` is provided when calling the
949        method, it will default to `None` and the method will then call `self.get_input()` to
950        :type input_file: str
951        :return: The function `get_input_compressed` returns the compressed format of the input
952        variable.
953        """
954
955        if not input_file:
956            input_file = self.get_input()
957        input_compressed = get_file_compressed(input_file)
958        return input_compressed

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to obtain the input file.
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
    def get_output(self) -> str:
        """
        Return the output file.

        :return: the output file
        """

        return self.output

It returns the output file.

Returns

The output file.

def get_output_format(self, output_file: str = None) -> str:
968    def get_output_format(self, output_file: str = None) -> str:
969        """
970        The function `get_output_format` returns the format of the input variable or the output file if
971        provided.
972
973        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
974        that represents the file path of the output file. If no `output_file` is provided when calling
975        the method, it will default to the output obtained from the `get_output` method of the class
976        instance. The
977        :type output_file: str
978        :return: The format of the input variable is being returned.
979        """
980
981        if not output_file:
982            output_file = self.get_output()
983        output_format = get_file_format(output_file)
984
985        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
987    def get_config(self) -> dict:
988        """
989        It returns the config
990        :return: The config variable is being returned.
991        """
992        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
994    def get_param(self) -> dict:
995        """
996        It returns the param
997        :return: The param variable is being returned.
998        """
999        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
1001    def get_connexion_db(self) -> str:
1002        """
1003        It returns the connexion_db attribute of the object
1004        :return: The connexion_db is being returned.
1005        """
1006        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
1008    def get_prefix(self) -> str:
1009        """
1010        It returns the prefix of the object.
1011        :return: The prefix is being returned.
1012        """
1013        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
1015    def get_table_variants(self, clause: str = "select") -> str:
1016        """
1017        This function returns the table_variants attribute of the object
1018
1019        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
1020        defaults to select (optional)
1021        :return: The table_variants attribute of the object.
1022        """
1023
1024        # Access
1025        access = self.get_config().get("access", None)
1026
1027        # Clauses "select", "where", "update"
1028        if clause in ["select", "where", "update"]:
1029            table_variants = self.table_variants
1030        # Clause "from"
1031        elif clause in ["from"]:
1032            # For Read Only
1033            if self.get_input_format() in ["parquet"] and access in ["RO"]:
1034                input_file = self.get_input()
1035                table_variants = f"'{input_file}' as variants"
1036            # For Read Write
1037            else:
1038                table_variants = f"{self.table_variants} as variants"
1039        else:
1040            table_variants = self.table_variants
1041        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
1043    def get_tmp_dir(self) -> str:
1044        """
1045        The function `get_tmp_dir` returns the temporary directory path based on configuration
1046        parameters or a default path.
1047        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1048        configuration, parameters, and a default value of "/tmp".
1049        """
1050
1051        return get_tmp(
1052            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1053        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1055    def get_connexion_type(self) -> str:
1056        """
1057        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1058
1059        :return: The connexion type is being returned.
1060        """
1061        return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory" when not configured.

Returns

The connexion type is being returned.

def get_connexion(self):
1063    def get_connexion(self):
1064        """
1065        It returns the connection object
1066
1067        :return: The connection object.
1068        """
1069        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1071    def close_connexion(self) -> None:
1072        """
1073        This function closes the connection to the database.
1074        :return: The connection is being closed.
1075        """
1076        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1078    def get_header(self, type: str = "vcf"):
1079        """
1080        This function returns the header of the VCF file as a list of strings
1081
1082        :param type: the type of header you want to get, defaults to vcf (optional)
1083        :return: The header of the vcf file.
1084        """
1085
1086        if self.header_vcf:
1087            if type == "vcf":
1088                return self.header_vcf
1089            elif type == "list":
1090                return self.header_list
1091        else:
1092            if type == "vcf":
1093                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1094                return header
1095            elif type == "list":
1096                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_infos_list(self) -> list:
1098    def get_header_infos_list(self) -> list:
1099        """
1100        This function retrieves a list of information fields from the header.
1101        :return: A list of information fields from the header.
1102        """
1103
1104        # Init
1105        infos_list = []
1106
1107        for field in self.get_header().infos:
1108            infos_list.append(field)
1109
1110        return infos_list

This function retrieves a list of information fields from the header.

Returns

A list of information fields from the header.

def get_header_length(self, file: str = None) -> int:
1112    def get_header_length(self, file: str = None) -> int:
1113        """
1114        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1115        line.
1116
1117        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1118        header file. If this argument is provided, the function will read the header from the specified
1119        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1120        :type file: str
1121        :return: the length of the header list, excluding the #CHROM line.
1122        """
1123
1124        if file:
1125            return len(self.read_vcf_header_file(file=file)) - 1
1126        elif self.get_header(type="list"):
1127            return len(self.get_header(type="list")) - 1
1128        else:
1129            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1131    def get_header_columns(self) -> str:
1132        """
1133        This function returns the header list of a VCF
1134
1135        :return: The length of the header list.
1136        """
1137        if self.get_header():
1138            return self.get_header(type="list")[-1]
1139        else:
1140            return ""

This function returns the #CHROM columns line of the VCF header

Returns

The last line of the header (the #CHROM columns line), or an empty string when no header is available.

def get_header_columns_as_list(self) -> list:
1142    def get_header_columns_as_list(self) -> list:
1143        """
1144        This function returns the header list of a VCF
1145
1146        :return: The length of the header list.
1147        """
1148        if self.get_header():
1149            return self.get_header_columns().strip().split("\t")
1150        else:
1151            return []

This function returns the column names of the VCF header's #CHROM line as a list

Returns

The list of column names of the #CHROM header line, or an empty list when no header is available.

def get_header_columns_as_sql(self) -> str:
1153    def get_header_columns_as_sql(self) -> str:
1154        """
1155        This function retruns header length (without #CHROM line)
1156
1157        :return: The length of the header list.
1158        """
1159        sql_column_list = []
1160        for col in self.get_header_columns_as_list():
1161            sql_column_list.append(f'"{col}"')
1162        return ",".join(sql_column_list)

This function returns the header column names as a comma-separated list of quoted SQL identifiers

Returns

A string such as '"#CHROM","POS","ID",...'.

def get_header_sample_list( self, check: bool = False, samples: list = None, samples_force: bool = False) -> list:
1164    def get_header_sample_list(
1165        self, check: bool = False, samples: list = None, samples_force: bool = False
1166    ) -> list:
1167        """
1168        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
1169        checking and filtering based on input parameters.
1170
1171        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
1172        parameter that determines whether to check if the samples in the list are properly defined as
1173        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
1174        list is defined as a, defaults to False
1175        :type check: bool (optional)
1176        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
1177        allows you to specify a subset of samples from the header. If you provide a list of sample
1178        names, the function will check if each sample is defined in the header. If a sample is not found
1179        in the
1180        :type samples: list
1181        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
1182        a boolean parameter that determines whether to force the function to return the sample list
1183        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
1184        function will return the sample list without performing, defaults to False
1185        :type samples_force: bool (optional)
1186        :return: The function `get_header_sample_list` returns a list of samples based on the input
1187        parameters and conditions specified in the function.
1188        """
1189
1190        # Init
1191        samples_list = []
1192
1193        if samples is None:
1194            samples_list = self.header_vcf.samples
1195        else:
1196            samples_checked = []
1197            for sample in samples:
1198                if sample in self.header_vcf.samples:
1199                    samples_checked.append(sample)
1200                else:
1201                    log.warning(f"Sample '{sample}' not defined in header")
1202            samples_list = samples_checked
1203
1204            # Force sample list without checking if is_genotype_column
1205            if samples_force:
1206                log.warning(f"Samples {samples_list} not checked if genotypes")
1207                return samples_list
1208
1209        if check:
1210            samples_checked = []
1211            for sample in samples_list:
1212                if self.is_genotype_column(column=sample):
1213                    samples_checked.append(sample)
1214                else:
1215                    log.warning(
1216                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
1217                    )
1218            samples_list = samples_checked
1219
1220        # Return samples list
1221        return samples_list

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify if each sample in the list is defined as a, defaults to False
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing, defaults to False
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
1223    def is_genotype_column(self, column: str = None) -> bool:
1224        """
1225        This function checks if a given column is a genotype column in a database.
1226
1227        :param column: The `column` parameter in the `is_genotype_column` method is a string that
1228        represents the column name in a database table. This method checks if the specified column is a
1229        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
1230        method of
1231        :type column: str
1232        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
1233        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
1234        column name and returns the result. If the `column` parameter is None, it returns False.
1235        """
1236
1237        if column is not None:
1238            return Database(database=self.get_input()).is_genotype_column(column=column)
1239        else:
1240            return False

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the is_genotype_column method of
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
1242    def get_verbose(self) -> bool:
1243        """
1244        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1245        exist
1246
1247        :return: The value of the key "verbose" in the config dictionary.
1248        """
1249        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1251    def get_connexion_format(self) -> str:
1252        """
1253        It returns the connexion format of the object.
1254        :return: The connexion_format is being returned.
1255        """
1256        connexion_format = self.connexion_format
1257        if connexion_format not in ["duckdb", "sqlite"]:
1258            log.error(f"Unknown connexion format {connexion_format}")
1259            raise ValueError(f"Unknown connexion format {connexion_format}")
1260        else:
1261            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
1263    def insert_file_to_table(
1264        self,
1265        file,
1266        columns: str,
1267        header_len: int = 0,
1268        sep: str = "\t",
1269        chunksize: int = 1000000,
1270    ) -> None:
1271        """
1272        The function reads a file in chunks and inserts each chunk into a table based on the specified
1273        database format.
1274
1275        :param file: The `file` parameter is the file that you want to load into a table. It should be
1276        the path to the file on your system
1277        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
1278        should contain the names of the columns in the table where the data will be inserted. The column
1279        names should be separated by commas within the string. For example, if you have columns named
1280        "id", "name
1281        :type columns: str
1282        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
1283        the number of lines to skip at the beginning of the file before reading the actual data. This
1284        parameter allows you to skip any header information present in the file before processing the
1285        data, defaults to 0
1286        :type header_len: int (optional)
1287        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
1288        separator character that is used in the file being read. In this case, the default separator is
1289        set to `\t`, which represents a tab character. You can change this parameter to a different
1290        separator character if, defaults to \t
1291        :type sep: str (optional)
1292        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
1293        when processing the file in chunks. In the provided code snippet, the default value for
1294        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
1295        to 1000000
1296        :type chunksize: int (optional)
1297        """
1298
1299        # Config
1300        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1301        connexion_format = self.get_connexion_format()
1302
1303        log.debug("chunksize: " + str(chunksize))
1304
1305        if chunksize:
1306            for chunk in pd.read_csv(
1307                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1308            ):
1309                if connexion_format in ["duckdb"]:
1310                    sql_insert_into = (
1311                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1312                    )
1313                    self.conn.execute(sql_insert_into)
1314                elif connexion_format in ["sqlite"]:
1315                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to , which represents a tab character. You can change this parameter to a different separator character if, defaults to
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1317    def load_data(
1318        self,
1319        input_file: str = None,
1320        drop_variants_table: bool = False,
1321        sample_size: int = 20480,
1322    ) -> None:
1323        """
1324        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1325        table before loading the data and specify a sample size.
1326
1327        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1328        table
1329        :type input_file: str
1330        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1331        determines whether the variants table should be dropped before loading the data. If set to
1332        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1333        not be dropped, defaults to False
1334        :type drop_variants_table: bool (optional)
1335        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1336        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1337        20480
1338        :type sample_size: int (optional)
1339        """
1340
1341        log.info("Loading...")
1342
1343        # change input file
1344        if input_file:
1345            self.set_input(input_file)
1346            self.set_header()
1347
1348        # drop variants table
1349        if drop_variants_table:
1350            self.drop_variants_table()
1351
1352        # get table variants
1353        table_variants = self.get_table_variants()
1354
1355        # Access
1356        access = self.get_config().get("access", None)
1357        log.debug(f"access: {access}")
1358
1359        # Input format and compress
1360        input_format = self.get_input_format()
1361        input_compressed = self.get_input_compressed()
1362        log.debug(f"input_format: {input_format}")
1363        log.debug(f"input_compressed: {input_compressed}")
1364
1365        # input_compressed_format
1366        if input_compressed:
1367            input_compressed_format = "gzip"
1368        else:
1369            input_compressed_format = "none"
1370        log.debug(f"input_compressed_format: {input_compressed_format}")
1371
1372        # Connexion format
1373        connexion_format = self.get_connexion_format()
1374
1375        # Sample size
1376        if not sample_size:
1377            sample_size = -1
1378        log.debug(f"sample_size: {sample_size}")
1379
1380        # Load data
1381        log.debug(f"Load Data from {input_format}")
1382
1383        # DuckDB connexion
1384        if connexion_format in ["duckdb"]:
1385
1386            # Database already exists
1387            if self.input_format in ["db", "duckdb"]:
1388
1389                if connexion_format in ["duckdb"]:
1390                    log.debug(f"Input file format '{self.input_format}' duckDB")
1391                else:
1392                    log.error(
1393                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1394                    )
1395                    raise ValueError(
1396                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1397                    )
1398
1399            # Load from existing database format
1400            else:
1401
1402                try:
1403                    # Create Table or View
1404                    database = Database(database=self.input)
1405                    sql_from = database.get_sql_from(sample_size=sample_size)
1406
1407                    if access in ["RO"]:
1408                        sql_load = (
1409                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1410                        )
1411                    else:
1412                        sql_load = (
1413                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1414                        )
1415                    self.conn.execute(sql_load)
1416
1417                except:
1418                    # Format not available
1419                    log.error(f"Input file format '{self.input_format}' not available")
1420                    raise ValueError(
1421                        f"Input file format '{self.input_format}' not available"
1422                    )
1423
1424        # SQLite connexion
1425        elif connexion_format in ["sqlite"] and input_format in [
1426            "vcf",
1427            "tsv",
1428            "csv",
1429            "psv",
1430        ]:
1431
1432            # Main structure
1433            structure = {
1434                "#CHROM": "VARCHAR",
1435                "POS": "INTEGER",
1436                "ID": "VARCHAR",
1437                "REF": "VARCHAR",
1438                "ALT": "VARCHAR",
1439                "QUAL": "VARCHAR",
1440                "FILTER": "VARCHAR",
1441                "INFO": "VARCHAR",
1442            }
1443
1444            # Strcuture with samples
1445            structure_complete = structure
1446            if self.get_header_sample_list():
1447                structure["FORMAT"] = "VARCHAR"
1448                for sample in self.get_header_sample_list():
1449                    structure_complete[sample] = "VARCHAR"
1450
1451            # Columns list for create and insert
1452            sql_create_table_columns = []
1453            sql_create_table_columns_list = []
1454            for column in structure_complete:
1455                column_type = structure_complete[column]
1456                sql_create_table_columns.append(
1457                    f'"{column}" {column_type} default NULL'
1458                )
1459                sql_create_table_columns_list.append(f'"{column}"')
1460
1461            # Create database
1462            log.debug(f"Create Table {table_variants}")
1463            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1464            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1465            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1466            self.conn.execute(sql_create_table)
1467
1468            # chunksize define length of file chunk load file
1469            chunksize = 100000
1470
1471            # delimiter
1472            delimiter = file_format_delimiters.get(input_format, "\t")
1473
1474            # Load the input file
1475            with open(self.input, "rt") as input_file:
1476
1477                # Use the appropriate file handler based on the input format
1478                if input_compressed:
1479                    input_file = bgzf.open(self.input, "rt")
1480                if input_format in ["vcf"]:
1481                    header_len = self.get_header_length()
1482                else:
1483                    header_len = 0
1484
1485                # Insert the file contents into a table
1486                self.insert_file_to_table(
1487                    input_file,
1488                    columns=sql_create_table_columns_list_sql,
1489                    header_len=header_len,
1490                    sep=delimiter,
1491                    chunksize=chunksize,
1492                )
1493
1494        else:
1495            log.error(
1496                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1497            )
1498            raise ValueError(
1499                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1500            )
1501
1502        # Explode INFOS fields into table fields
1503        if self.get_explode_infos():
1504            self.explode_infos(
1505                prefix=self.get_explode_infos_prefix(),
1506                fields=self.get_explode_infos_fields(),
1507                force=True,
1508            )
1509
1510        # Create index after insertion
1511        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1513    def get_explode_infos(self) -> bool:
1514        """
1515        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1516        to False if it is not set.
1517        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1518        value. If the parameter is not present, it will return False.
1519        """
1520
1521        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1523    def get_explode_infos_fields(
1524        self,
1525        explode_infos_fields: str = None,
1526        remove_fields_not_in_header: bool = False,
1527    ) -> list:
1528        """
1529        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1530        the input parameter `explode_infos_fields`.
1531
1532        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1533        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1534        comma-separated list of field names to explode
1535        :type explode_infos_fields: str
1536        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1537        flag that determines whether to remove fields that are not present in the header. If it is set
1538        to `True`, any field that is not in the header will be excluded from the list of exploded
1539        information fields. If it is set to `, defaults to False
1540        :type remove_fields_not_in_header: bool (optional)
1541        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1542        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1543        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1544        Otherwise, it returns a list of exploded information fields after removing any spaces and
1545        splitting the string by commas.
1546        """
1547
1548        # If no fields, get it in param
1549        if not explode_infos_fields:
1550            explode_infos_fields = (
1551                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1552            )
1553
1554        # If no fields, defined as all fields in header using keyword
1555        if not explode_infos_fields:
1556            explode_infos_fields = "*"
1557
1558        # If fields list not empty
1559        if explode_infos_fields:
1560
1561            # Input fields list
1562            if isinstance(explode_infos_fields, str):
1563                fields_input = explode_infos_fields.split(",")
1564            elif isinstance(explode_infos_fields, list):
1565                fields_input = explode_infos_fields
1566            else:
1567                fields_input = []
1568
1569            # Fields list without * keyword
1570            fields_without_all = fields_input.copy()
1571            if "*".casefold() in (item.casefold() for item in fields_without_all):
1572                fields_without_all.remove("*")
1573
1574            # Fields in header
1575            fields_in_header = sorted(list(set(self.get_header().infos)))
1576
1577            # Construct list of fields
1578            fields_output = []
1579            for field in fields_input:
1580
1581                # Strip field
1582                field = field.strip()
1583
1584                # format keyword * in regex
1585                if field.upper() in ["*"]:
1586                    field = ".*"
1587
1588                # Find all fields with pattern
1589                r = re.compile(field)
1590                fields_search = sorted(list(filter(r.match, fields_in_header)))
1591
1592                # Remove fields input from search
1593                if field in fields_search:
1594                    fields_search = [field]
1595                elif fields_search != [field]:
1596                    fields_search = sorted(
1597                        list(set(fields_search).difference(fields_input))
1598                    )
1599
1600                # If field is not in header (avoid not well formatted header)
1601                if not fields_search and not remove_fields_not_in_header:
1602                    fields_search = [field]
1603
1604                # Add found fields
1605                for new_field in fields_search:
1606                    # Add field, if not already exists, and if it is in header (if asked)
1607                    if (
1608                        new_field not in fields_output
1609                        and (
1610                            not remove_fields_not_in_header
1611                            or new_field in fields_in_header
1612                        )
1613                        and new_field not in [".*"]
1614                    ):
1615                        fields_output.append(new_field)
1616
1617            return fields_output
1618
1619        else:
1620
1621            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter specifies the fields to be exploded, either as a comma-separated string or a list of field names. Entries may be regex patterns, and the keyword "*" matches all fields in the header
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields; defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided, the configured fields are used, defaulting to "*" (all header fields). String input is split on commas, entries are stripped of spaces, and each entry is resolved against the fields present in the header.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1623    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1624        """
1625        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1626        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1627        not provided.
1628
1629        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1630        prefix to be used for exploding or expanding information
1631        :type explode_infos_prefix: str
1632        :return: the value of the variable `explode_infos_prefix`.
1633        """
1634
1635        if not explode_infos_prefix:
1636            explode_infos_prefix = (
1637                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1638            )
1639
1640        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a
        default value if it doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The name of the column that you want to add to the
        table (existence check against current columns is case-insensitive)
        :param column_type: The SQL data type of the column to add, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: Optional default value for the newly added column.
        NOTE(review): the value is interpolated verbatim into the ALTER TABLE
        statement, so it must be a valid SQL literal (e.g. "null", "0", "'x'")
        :param drop: If True and the column already exists, the existing column
        is dropped and re-created; if False (default), an existing column is left
        untouched and None is returned, defaults to False
        :type drop: bool (optional)
        :return: a dictionary describing the column (keys "table_name",
        "column_name", "column_type", "default_value") when a brand new column
        was added, or None when the column already existed (including the case
        where it was dropped and re-created with `drop=True`).
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (LIMIT 0 fetches only the schema, no rows)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A dropped-and-re-created column does not count as "added"
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column; defaults to False
Returns

a dictionary describing the added column (table name, column name, type and default value) when a new column was added, or None when the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1714    def drop_column(
1715        self, column: dict = None, table_name: str = None, column_name: str = None
1716    ) -> bool:
1717        """
1718        The `drop_column` function drops a specified column from a given table in a database and returns
1719        True if the column was successfully dropped, and False if the column does not exist in the
1720        table.
1721
1722        :param column: The `column` parameter is a dictionary that contains information about the column
1723        you want to drop. It has two keys:
1724        :type column: dict
1725        :param table_name: The `table_name` parameter is the name of the table from which you want to
1726        drop a column
1727        :type table_name: str
1728        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1729        from the table
1730        :type column_name: str
1731        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1732        and False if the column does not exist in the table.
1733        """
1734
1735        # Find column infos
1736        if column:
1737            if isinstance(column, dict):
1738                table_name = column.get("table_name", None)
1739                column_name = column.get("column_name", None)
1740            elif isinstance(column, str):
1741                table_name = self.get_table_variants()
1742                column_name = column
1743            else:
1744                table_name = None
1745                column_name = None
1746
1747        if not table_name and not column_name:
1748            return False
1749
1750        # Removed
1751        removed = False
1752
1753        # Check if the column already exists in the table
1754        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1755        columns = self.get_query_to_df(query).columns.tolist()
1756        if column_name in columns:
1757            log.debug(f"The {column_name} column exists in the {table_name} table")
1758        else:
1759            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1760            return False
1761
1762        # Add column in table # ALTER TABLE integers DROP k
1763        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1764        self.execute_query(add_column_query)
1765        removed = True
1766        log.debug(
1767            f"The {column_name} column was successfully dropped to the {table_name} table"
1768        )
1769
1770        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys:
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function explodes the VCF INFO fields into
        individual table columns, returning the list of added columns.

        Indexes are dropped before the update and optionally re-created at the
        end. Nothing is done when the database is opened read-only (access "RO").

        :param prefix: Prefix for the exploded INFO columns. If not provided
        (or not a string), the configured prefix is used, falling back to
        "INFO/"
        :type prefix: str
        :param create_index: Whether to re-create indexes (including the newly
        exploded columns) after the update, defaults to False
        :type create_index: bool (optional)
        :param fields: List of INFO fields to explode (patterns allowed, see
        `get_explode_infos_fields`). If not provided, all INFO fields are
        exploded
        :type fields: list
        :param force: If True, an already-existing column is dropped and
        re-created, and its values are re-computed, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: If True, all fields are updated in
        a single UPDATE statement per chromosome; otherwise one UPDATE per
        field, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: Name of the table to add the exploded columns to;
        defaults to the variants table
        :type table: str
        :return: The list of added columns (dicts as returned by `add_column`).
        """

        # drop indexes
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: fall back to empty list on failure)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/cardinality from the header; unknown fields
                    # default to a scalar String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract the '<info>=' value from
                        # the raw INFO column ('' and '.' become NULL)
                        # NOTE(review): update_info_field is only assigned for
                        # duckdb/sqlite; another connexion format would re-append
                        # the previous value (or raise NameError) — confirm the
                        # supported formats upstream
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (updates are chunked per chromosome)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when there are several chromosomes)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the `
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated. If force is set to `False, defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually. The default value is, defaults to False
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name. If the table parameter is
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1989    def create_indexes(self) -> None:
1990        """
1991        Create indexes on the table after insertion
1992        """
1993
1994        # Access
1995        access = self.get_config().get("access", None)
1996
1997        # get table variants
1998        table_variants = self.get_table_variants("FROM")
1999
2000        if self.get_indexing() and access not in ["RO"]:
2001            # Create index
2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
2003            self.conn.execute(sql_create_table_index)
2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
2005            self.conn.execute(sql_create_table_index)
2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
2007            self.conn.execute(sql_create_table_index)
2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
2009            self.conn.execute(sql_create_table_index)
2010            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
2011            self.conn.execute(sql_create_table_index)
2012            for field in self.index_additionnal_fields:
2013                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
2014                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
2016    def drop_indexes(self) -> None:
2017        """
2018        Create indexes on the table after insertion
2019        """
2020
2021        # Access
2022        access = self.get_config().get("access", None)
2023
2024        # get table variants
2025        table_variants = self.get_table_variants("FROM")
2026
2027        # Get database format
2028        connexion_format = self.get_connexion_format()
2029
2030        if access not in ["RO"]:
2031            if connexion_format in ["duckdb"]:
2032                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2033            elif connexion_format in ["sqlite"]:
2034                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2035
2036            list_indexes = self.conn.execute(sql_list_indexes)
2037            index_names = [row[0] for row in list_indexes.fetchall()]
2038            for index in index_names:
2039                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2040                self.conn.execute(sql_drop_table_index)

Drop all indexes on the variants table.

def read_vcf_header(self, f) -> list:
2042    def read_vcf_header(self, f) -> list:
2043        """
2044        It reads the header of a VCF file and returns a list of the header lines
2045
2046        :param f: the file object
2047        :return: The header lines of the VCF file.
2048        """
2049
2050        header_list = []
2051        for line in f:
2052            header_list.append(line)
2053            if line.startswith("#CHROM"):
2054                break
2055        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2057    def read_vcf_header_file(self, file: str = None) -> list:
2058        """
2059        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2060        uncompressed files.
2061
2062        :param file: The `file` parameter is a string that represents the path to the VCF header file
2063        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2064        default to `None`
2065        :type file: str
2066        :return: The function `read_vcf_header_file` returns a list.
2067        """
2068
2069        if self.get_input_compressed(input_file=file):
2070            with bgzf.open(file, "rt") as f:
2071                return self.read_vcf_header(f=f)
2072        else:
2073            with open(file, "rt") as f:
2074                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2076    def execute_query(self, query: str):
2077        """
2078        It takes a query as an argument, executes it, and returns the results
2079
2080        :param query: The query to be executed
2081        :return: The result of the query is being returned.
2082        """
2083        if query:
2084            return self.conn.execute(query)  # .fetchall()
2085        else:
2086            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None, fields_to_rename: dict | None = None) -> bool:
2088    def export_output(
2089        self,
2090        output_file: str | None = None,
2091        output_header: str | None = None,
2092        export_header: bool = True,
2093        query: str | None = None,
2094        parquet_partitions: list | None = None,
2095        chunk_size: int | None = None,
2096        threads: int | None = None,
2097        sort: bool = False,
2098        index: bool = False,
2099        order_by: str | None = None,
2100        fields_to_rename: dict | None = None
2101    ) -> bool:
2102        """
2103        The `export_output` function exports data from a VCF file to various formats, including VCF,
2104        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
2105        partitioning.
2106        
2107        :param output_file: The `output_file` parameter is a string that specifies the name of the
2108        output file where the exported data will be saved
2109        :type output_file: str | None
2110        :param output_header: The `output_header` parameter is a string that specifies the name of the
2111        file where the header of the VCF file will be exported. If this parameter is not provided, the
2112        header will be exported to a file with the same name as the `output_file` parameter, but with
2113        the extension "
2114        :type output_header: str | None
2115        :param export_header: The `export_header` parameter is a boolean flag that determines whether
2116        the header of a VCF file should be exported to a separate file or not. If `export_header` is
2117        True, the header will be exported to a file. If `export_header` is False, the header will not
2118        be, defaults to True
2119        :type export_header: bool (optional)
2120        :param query: The `query` parameter in the `export_output` function is an optional SQL query
2121        that can be used to filter and select specific data from the VCF file before exporting it. If
2122        provided, only the data that matches the query will be exported. This allows you to customize
2123        the exported data based on
2124        :type query: str | None
2125        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
2126        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
2127        organize data in a hierarchical directory structure based on the values of one or more columns.
2128        This can improve query performance when working with large datasets
2129        :type parquet_partitions: list | None
2130        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
2131        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
2132        multiple files. It helps in optimizing the export process by breaking down the data into
2133        manageable chunks for processing and storage
2134        :type chunk_size: int | None
2135        :param threads: The `threads` parameter in the `export_output` function specifies the number of
2136        threads to be used during the export process. It determines the level of parallelism and can
2137        improve the performance of the export operation. If this parameter is not provided, the function
2138        will use the default number of threads
2139        :type threads: int | None
2140        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
2141        determines whether the output file should be sorted based on genomic coordinates of the
2142        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
2143        `False`,, defaults to False
2144        :type sort: bool (optional)
2145        :param index: The `index` parameter in the `export_output` function is a boolean flag that
2146        determines whether an index should be created on the output file. If `index` is set to `True`,
2147        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
2148        :type index: bool (optional)
2149        :param order_by: The `order_by` parameter in the `export_output` function is a string that
2150        specifies the column(s) to use for sorting the output file. This parameter is only applicable
2151        when exporting data in VCF format. It allows you to specify the column(s) based on which the
2152        output file should be
2153        :type order_by: str | None
2154        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
2155        mapping of field names to be renamed during the export process. This parameter allows you to
2156        customize the output field names before exporting the data. Each key-value pair in the
2157        dictionary represents the original field name as the key and the new field name
2158        :type fields_to_rename: dict | None
2159        :return: The `export_output` function returns a boolean value. It checks if the output file
2160        exists and returns True if it does, or None if it doesn't.
2161        """
2162
2163        # Log
2164        log.info("Exporting...")
2165
2166        # Full path
2167        output_file = full_path(output_file)
2168        output_header = full_path(output_header)
2169
2170        # Config
2171        config = self.get_config()
2172
2173        # Param
2174        param = self.get_param()
2175
2176        # Tmp files to remove
2177        tmp_to_remove = []
2178
2179        # If no output, get it
2180        if not output_file:
2181            output_file = self.get_output()
2182
2183        # If not threads
2184        if not threads:
2185            threads = self.get_threads()
2186
2187        # Rename fields
2188        if not fields_to_rename:
2189            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
2190        self.rename_info_fields(fields_to_rename=fields_to_rename)
2191
2192        # Auto header name with extension
2193        if export_header or output_header:
2194            if not output_header:
2195                output_header = f"{output_file}.hdr"
2196            # Export header
2197            self.export_header(output_file=output_file)
2198
2199        # Switch off export header if VCF output
2200        output_file_type = get_file_format(output_file)
2201        if output_file_type in ["vcf"]:
2202            export_header = False
2203            tmp_to_remove.append(output_header)
2204
2205        # Chunk size
2206        if not chunk_size:
2207            chunk_size = config.get("chunk_size", None)
2208
2209        # Parquet partition
2210        if not parquet_partitions:
2211            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2212        if parquet_partitions and isinstance(parquet_partitions, str):
2213            parquet_partitions = parquet_partitions.split(",")
2214
2215        # Order by
2216        if not order_by:
2217            order_by = param.get("export", {}).get("order_by", "")
2218
2219        # Header in output
2220        header_in_output = param.get("export", {}).get("include_header", False)
2221
2222        # Database
2223        database_source = self.get_connexion()
2224
2225        # Connexion format
2226        connexion_format = self.get_connexion_format()
2227
2228        # Explode infos
2229        if self.get_explode_infos():
2230            self.explode_infos(
2231                prefix=self.get_explode_infos_prefix(),
2232                fields=self.get_explode_infos_fields(),
2233                force=False,
2234            )
2235
2236        # if connexion_format in ["sqlite"] or query:
2237        if connexion_format in ["sqlite"]:
2238
2239            # Export in Parquet
2240            random_tmp = "".join(
2241                random.choice(string.ascii_lowercase) for i in range(10)
2242            )
2243            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2244            tmp_to_remove.append(database_source)
2245
2246            # Table Variants
2247            table_variants = self.get_table_variants()
2248
2249            # Create export query
2250            sql_query_export_subquery = f"""
2251                SELECT * FROM {table_variants}
2252                """
2253
2254            # Write source file
2255            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2256
2257        # Create database
2258        database = Database(
2259            database=database_source,
2260            table="variants",
2261            header_file=output_header,
2262            conn_config=self.get_connexion_config(),
2263        )
2264
2265        # Existing colomns header
2266        existing_columns_header = database.get_header_columns_from_database(query=query)
2267
2268        # Sample list
2269        if output_file_type in ["vcf"]:
2270            get_samples = self.get_samples()
2271            get_samples_check = self.get_samples_check()
2272            samples_force = get_samples is not None
2273            sample_list = self.get_header_sample_list(
2274                check=get_samples_check,
2275                samples=get_samples,
2276                samples_force=samples_force,
2277            )
2278        else:
2279            sample_list = None
2280
2281        # Export file
2282        database.export(
2283            output_database=output_file,
2284            output_header=output_header,
2285            existing_columns_header=existing_columns_header,
2286            parquet_partitions=parquet_partitions,
2287            chunk_size=chunk_size,
2288            threads=threads,
2289            sort=sort,
2290            index=index,
2291            header_in_output=header_in_output,
2292            order_by=order_by,
2293            query=query,
2294            export_header=export_header,
2295            sample_list=sample_list,
2296        )
2297
2298        # Remove
2299        remove_if_exists(tmp_to_remove)
2300
2301        return (os.path.exists(output_file) or None) and (
2302            os.path.exists(output_file) or None
2303        )

The export_output function exports data from a VCF file to various formats, including VCF, CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and partitioning.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr"
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True
  • query: The query parameter in the export_output function is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported. This allows you to customize the exported data based on
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in a batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. It helps in optimizing the export process by breaking down the data into manageable chunks for processing and storage
  • threads: The threads parameter in the export_output function specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If this parameter is not provided, the function will use the default number of threads
  • sort: The sort parameter in the export_output function is a boolean flag that determines whether the output file should be sorted based on genomic coordinates of the variants. If sort is set to True, the output file will be sorted; if set to False, it will not be. Defaults to False
  • index: The index parameter in the export_output function is a boolean flag that determines whether an index should be created on the output file. If index is set to True, an index will be created on the output file; if set to False, no index will be created. Defaults to False
  • order_by: The order_by parameter in the export_output function is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format. It allows you to specify the column(s) based on which the output file should be
  • fields_to_rename: The fields_to_rename parameter is a dictionary that specifies the mapping of field names to be renamed during the export process. This parameter allows you to customize the output field names before exporting the data. Each key-value pair in the dictionary represents the original field name as the key and the new field name
Returns

The export_output function returns a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2305    def get_extra_infos(self, table: str = None) -> list:
2306        """
2307        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2308        in the header.
2309
2310        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2311        name of the table from which you want to retrieve the extra columns that are not present in the
2312        header. If the `table` parameter is not provided when calling the function, it will default to
2313        using the variants
2314        :type table: str
2315        :return: A list of columns that are in the specified table but not in the header of the table.
2316        """
2317
2318        header_columns = []
2319
2320        if not table:
2321            table = self.get_table_variants(clause="from")
2322            header_columns = self.get_header_columns()
2323
2324        # Check all columns in the database
2325        query = f""" SELECT * FROM {table} LIMIT 1 """
2326        log.debug(f"query {query}")
2327        table_columns = self.get_query_to_df(query).columns.tolist()
2328        extra_columns = []
2329
2330        # Construct extra infos (not in header)
2331        for column in table_columns:
2332            if column not in header_columns:
2333                extra_columns.append(column)
2334
2335        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2337    def get_extra_infos_sql(self, table: str = None) -> str:
2338        """
2339        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2340        by double quotes
2341
2342        :param table: The name of the table to get the extra infos from. If None, the default table is
2343        used
2344        :type table: str
2345        :return: A string of the extra infos
2346        """
2347
2348        return ", ".join(
2349            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2350        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2352    def export_header(
2353        self,
2354        header_name: str = None,
2355        output_file: str = None,
2356        output_file_ext: str = ".hdr",
2357        clean_header: bool = True,
2358        remove_chrom_line: bool = False,
2359    ) -> str:
2360        """
2361        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2362        specified options, and writes it to a new file.
2363
2364        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2365        this parameter is not specified, the header will be written to the output file
2366        :type header_name: str
2367        :param output_file: The `output_file` parameter in the `export_header` function is used to
2368        specify the name of the output file where the header will be written. If this parameter is not
2369        provided, the header will be written to a temporary file
2370        :type output_file: str
2371        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2372        string that represents the extension of the output header file. By default, it is set to ".hdr"
2373        if not specified by the user. This extension will be appended to the `output_file` name to
2374        create the final, defaults to .hdr
2375        :type output_file_ext: str (optional)
2376        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2377        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2378        `True`, the function will clean the header by modifying certain lines based on a specific
2379        pattern. If `clean_header`, defaults to True
2380        :type clean_header: bool (optional)
2381        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2382        boolean flag that determines whether the #CHROM line should be removed from the header before
2383        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2384        defaults to False
2385        :type remove_chrom_line: bool (optional)
2386        :return: The function `export_header` returns the name of the temporary header file that is
2387        created.
2388        """
2389
2390        if not header_name and not output_file:
2391            output_file = self.get_output()
2392
2393        if self.get_header():
2394
2395            # Get header object
2396            header_obj = self.get_header()
2397
2398            # Create database
2399            db_for_header = Database(database=self.get_input())
2400
2401            # Get real columns in the file
2402            db_header_columns = db_for_header.get_columns()
2403
2404            with tempfile.TemporaryDirectory() as tmpdir:
2405
2406                # Write header file
2407                header_file_tmp = os.path.join(tmpdir, "header")
2408                f = open(header_file_tmp, "w")
2409                vcf.Writer(f, header_obj)
2410                f.close()
2411
2412                # Replace #CHROM line with rel columns
2413                header_list = db_for_header.read_header_file(
2414                    header_file=header_file_tmp
2415                )
2416                header_list[-1] = "\t".join(db_header_columns)
2417
2418                # Remove CHROM line
2419                if remove_chrom_line:
2420                    header_list.pop()
2421
2422                # Clean header
2423                if clean_header:
2424                    header_list_clean = []
2425                    for head in header_list:
2426                        # Clean head for malformed header
2427                        head_clean = head
2428                        head_clean = re.subn(
2429                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2430                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2431                            head_clean,
2432                            2,
2433                        )[0]
2434                        # Write header
2435                        header_list_clean.append(head_clean)
2436                    header_list = header_list_clean
2437
2438            tmp_header_name = output_file + output_file_ext
2439
2440            f = open(tmp_header_name, "w")
2441            for line in header_list:
2442                f.write(line)
2443            f.close()
2444
2445        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it will be kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2447    def export_variant_vcf(
2448        self,
2449        vcf_file,
2450        remove_info: bool = False,
2451        add_samples: bool = True,
2452        list_samples: list = [],
2453        where_clause: str = "",
2454        index: bool = False,
2455        threads: int | None = None,
2456    ) -> bool | None:
2457        """
2458        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2459        remove INFO field, add samples, and control compression and indexing.
2460
2461        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2462        written to. It is the output file that will contain the filtered VCF data based on the specified
2463        parameters
2464        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2465        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2466        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2467        in, defaults to False
2468        :type remove_info: bool (optional)
2469        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2470        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2471        If set to False, the samples will be removed. The default value is True, defaults to True
2472        :type add_samples: bool (optional)
2473        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2474        in the output VCF file. By default, all samples will be included. If you provide a list of
2475        samples, only those samples will be included in the output file
2476        :type list_samples: list
2477        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2478        determines whether or not to create an index for the output VCF file. If `index` is set to
2479        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2480        :type index: bool (optional)
2481        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2482        number of threads to use for exporting the VCF file. It determines how many parallel threads
2483        will be used during the export process. More threads can potentially speed up the export process
2484        by utilizing multiple cores of the processor. If
2485        :type threads: int | None
2486        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2487        method with various parameters including the output file, query, threads, sort flag, and index
2488        flag. The `export_output` method is responsible for exporting the VCF data based on the
2489        specified parameters and configurations provided in the `export_variant_vcf` function.
2490        """
2491
2492        # Config
2493        config = self.get_config()
2494
2495        # Extract VCF
2496        log.debug("Export VCF...")
2497
2498        # Table variants
2499        table_variants = self.get_table_variants()
2500
2501        # Threads
2502        if not threads:
2503            threads = self.get_threads()
2504
2505        # Info fields
2506        if remove_info:
2507            if not isinstance(remove_info, str):
2508                remove_info = "."
2509            info_field = f"""'{remove_info}' as INFO"""
2510        else:
2511            info_field = "INFO"
2512
2513        # Samples fields
2514        if add_samples:
2515            if not list_samples:
2516                list_samples = self.get_header_sample_list()
2517            if list_samples:
2518                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2519            else:
2520                samples_fields = ""
2521            log.debug(f"samples_fields: {samples_fields}")
2522        else:
2523            samples_fields = ""
2524
2525        # Where clause
2526        if where_clause is None:
2527            where_clause = ""
2528
2529        # Variants
2530        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2531        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2532        log.debug(f"sql_query_select={sql_query_select}")
2533
2534        return self.export_output(
2535            output_file=vcf_file,
2536            output_header=None,
2537            export_header=True,
2538            query=sql_query_select,
2539            parquet_partitions=None,
2540            chunk_size=config.get("chunk_size", None),
2541            threads=threads,
2542            sort=True,
2543            index=index,
2544            order_by=None,
2545        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2547    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2548        """
2549        It takes a list of commands and runs them in parallel using the number of threads specified
2550
2551        :param commands: A list of commands to run
2552        :param threads: The number of threads to use, defaults to 1 (optional)
2553        """
2554
2555        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2557    def get_threads(self, default: int = 1) -> int:
2558        """
2559        This function returns the number of threads to use for a job, with a default value of 1 if not
2560        specified.
2561
2562        :param default: The `default` parameter in the `get_threads` method is used to specify the
2563        default number of threads to use if no specific value is provided. If no value is provided for
2564        the `threads` parameter in the configuration or input parameters, the `default` value will be
2565        used, defaults to 1
2566        :type default: int (optional)
2567        :return: the number of threads to use for the current job.
2568        """
2569
2570        # Config
2571        config = self.get_config()
2572
2573        # Param
2574        param = self.get_param()
2575
2576        # Input threads
2577        input_thread = param.get("threads", config.get("threads", None))
2578
2579        # Check threads
2580        if not input_thread:
2581            threads = default
2582        elif int(input_thread) <= 0:
2583            threads = os.cpu_count()
2584        else:
2585            threads = int(input_thread)
2586        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2588    def get_memory(self, default: str = None) -> str:
2589        """
2590        This function retrieves the memory value from parameters or configuration with a default value
2591        if not found.
2592
2593        :param default: The `get_memory` function takes in a default value as a string parameter. This
2594        default value is used as a fallback in case the `memory` parameter is not provided in the
2595        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2596        the function
2597        :type default: str
2598        :return: The `get_memory` function returns a string value representing the memory parameter. If
2599        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2600        return the default value provided as an argument to the function.
2601        """
2602
2603        # Config
2604        config = self.get_config()
2605
2606        # Param
2607        param = self.get_param()
2608
2609        # Input threads
2610        input_memory = param.get("memory", config.get("memory", None))
2611
2612        # Check threads
2613        if input_memory:
2614            memory = input_memory
2615        else:
2616            memory = default
2617
2618        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns this default value.
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2620    def update_from_vcf(self, vcf_file: str) -> None:
2621        """
2622        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2623
2624        :param vcf_file: the path to the VCF file
2625        """
2626
2627        connexion_format = self.get_connexion_format()
2628
2629        if connexion_format in ["duckdb"]:
2630            self.update_from_vcf_duckdb(vcf_file)
2631        elif connexion_format in ["sqlite"]:
2632            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table in the database with the
        INFO column of a VCF file, joining rows on #CHROM/POS/REF/ALT.

        Existing INFO content is kept and the VCF's INFO is appended,
        separated by ';' when both sides are non-empty ('' and '.' count as
        empty).

        :param vcf_file: the path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame: skip the "##" meta-header lines so
        # that the "#CHROM ..." line becomes the column header (header=0)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: vcf_df looks unused, but duckdb resolves the "vcf_df" name in
        # the FROM clause below directly from the local Python scope
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2690    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2691        """
2692        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2693        table, then updates the INFO column of the variants table with the INFO column of the temporary
2694        table
2695
2696        :param vcf_file: The path to the VCF file you want to update the database with
2697        """
2698
2699        # Create a temporary table for the VCF
2700        table_vcf = "tmp_vcf"
2701        sql_create = (
2702            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2703        )
2704        self.conn.execute(sql_create)
2705
2706        # Loading VCF into temporaire table
2707        vcf_df = pd.read_csv(
2708            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2709        )
2710        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2711        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2712
2713        # Update table 'variants' with VCF data
2714        # warning: CONCAT as || operator
2715        sql_query_update = f"""
2716            UPDATE variants as table_variants
2717            SET INFO = CASE
2718                            WHEN INFO NOT IN ('', '.')
2719                            THEN INFO
2720                            ELSE ''
2721                        END ||
2722                        (
2723                        SELECT 
2724                            CASE 
2725                                WHEN table_variants.INFO NOT IN ('','.') 
2726                                    AND table_vcf.INFO NOT IN ('','.')  
2727                                THEN ';' 
2728                                ELSE '' 
2729                            END || 
2730                            CASE 
2731                                WHEN table_vcf.INFO NOT IN ('','.') 
2732                                THEN table_vcf.INFO 
2733                                ELSE '' 
2734                            END
2735                        FROM {table_vcf} as table_vcf
2736                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2737                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2738                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2739                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2740                        )
2741        """
2742        self.conn.execute(sql_query_update)
2743
2744        # Drop temporary table
2745        sql_drop = f"DROP TABLE {table_vcf}"
2746        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2748    def drop_variants_table(self) -> None:
2749        """
2750        > This function drops the variants table
2751        """
2752
2753        table_variants = self.get_table_variants()
2754        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2755        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash of the assembly, `#CHROM`, `POS`, `REF` and `ALT` columns.

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param takes precedence over config)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (dropped again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back on the default column name when an empty name is given
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column only when missing, unless forced
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash() argument '"{prefix}SVTYPE"' is a
            # SQL *string literal* (same constant for every row), not a
            # reference to the exploded SVTYPE column — confirm intent
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the columns added by explode_infos
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2816    def get_variant_id_column(
2817        self, variant_id_column: str = "variant_id", force: bool = None
2818    ) -> str:
2819        """
2820        This function returns the variant_id column name
2821
2822        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2823        defaults to variant_id
2824        :type variant_id_column: str (optional)
2825        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2826        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2827        if it is not already set, or if it is set
2828        :type force: bool
2829        :return: The variant_id column name.
2830        """
2831
2832        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id to be set to the value of variant_id_column. If False or None, will only set the variant_id if it is not already set
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2838    def scan_databases(
2839        self,
2840        database_formats: list = ["parquet"],
2841        database_releases: list = ["current"],
2842    ) -> dict:
2843        """
2844        The function `scan_databases` scans for available databases based on specified formats and
2845        releases.
2846
2847        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2848        of the databases to be scanned. In this case, the accepted format is "parquet"
2849        :type database_formats: list ["parquet"]
2850        :param database_releases: The `database_releases` parameter is a list that specifies the
2851        releases of the databases to be scanned. In the provided function, the default value for
2852        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2853        databases that are in the "current"
2854        :type database_releases: list
2855        :return: The function `scan_databases` returns a dictionary containing information about
2856        databases that match the specified formats and releases.
2857        """
2858
2859        # Config
2860        config = self.get_config()
2861
2862        # Param
2863        param = self.get_param()
2864
2865        # Param - Assembly
2866        assembly = param.get("assembly", config.get("assembly", None))
2867        if not assembly:
2868            assembly = DEFAULT_ASSEMBLY
2869            log.warning(f"Default assembly '{assembly}'")
2870
2871        # Scan for availabled databases
2872        log.info(
2873            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2874        )
2875        databases_infos_dict = databases_infos(
2876            database_folder_releases=database_releases,
2877            database_formats=database_formats,
2878            assembly=assembly,
2879            config=config,
2880        )
2881        log.info(
2882            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2883        )
2884
2885        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases that are in the "current" release.
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2887    def annotation(self) -> None:
2888        """
2889        It annotates the VCF file with the annotations specified in the config file.
2890        """
2891
2892        # Config
2893        config = self.get_config()
2894
2895        # Param
2896        param = self.get_param()
2897
2898        # Param - Assembly
2899        assembly = param.get("assembly", config.get("assembly", None))
2900        if not assembly:
2901            assembly = DEFAULT_ASSEMBLY
2902            log.warning(f"Default assembly '{assembly}'")
2903
2904        # annotations databases folders
2905        annotations_databases = set(
2906            config.get("folders", {})
2907            .get("databases", {})
2908            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2909            + config.get("folders", {})
2910            .get("databases", {})
2911            .get("parquet", ["~/howard/databases/parquet/current"])
2912            + config.get("folders", {})
2913            .get("databases", {})
2914            .get("bcftools", ["~/howard/databases/bcftools/current"])
2915        )
2916
2917        # Get param annotations
2918        if param.get("annotations", None) and isinstance(
2919            param.get("annotations", None), str
2920        ):
2921            log.debug(param.get("annotations", None))
2922            param_annotation_list = param.get("annotations").split(",")
2923        else:
2924            param_annotation_list = []
2925
2926        # Each tools param
2927        if param.get("annotation_parquet", None) != None:
2928            log.debug(
2929                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2930            )
2931            if isinstance(param.get("annotation_parquet", None), list):
2932                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2933            else:
2934                param_annotation_list.append(param.get("annotation_parquet"))
2935        if param.get("annotation_snpsift", None) != None:
2936            if isinstance(param.get("annotation_snpsift", None), list):
2937                param_annotation_list.append(
2938                    "snpsift:"
2939                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2940                )
2941            else:
2942                param_annotation_list.append(
2943                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2944                )
2945        if param.get("annotation_snpeff", None) != None:
2946            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2947        if param.get("annotation_bcftools", None) != None:
2948            if isinstance(param.get("annotation_bcftools", None), list):
2949                param_annotation_list.append(
2950                    "bcftools:"
2951                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2952                )
2953            else:
2954                param_annotation_list.append(
2955                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2956                )
2957        if param.get("annotation_annovar", None) != None:
2958            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2959        if param.get("annotation_exomiser", None) != None:
2960            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2961        if param.get("annotation_splice", None) != None:
2962            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2963
2964        # Merge param annotations list
2965        param["annotations"] = ",".join(param_annotation_list)
2966
2967        # debug
2968        log.debug(f"param_annotations={param['annotations']}")
2969
2970        if param.get("annotations"):
2971
2972            # Log
2973            # log.info("Annotations - Check annotation parameters")
2974
2975            if not "annotation" in param:
2976                param["annotation"] = {}
2977
2978            # List of annotations parameters
2979            annotations_list_input = {}
2980            if isinstance(param.get("annotations", None), str):
2981                annotation_file_list = [
2982                    value for value in param.get("annotations", "").split(",")
2983                ]
2984                for annotation_file in annotation_file_list:
2985                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
2986            else:
2987                annotations_list_input = param.get("annotations", {})
2988
2989            log.info(f"Quick Annotations:")
2990            for annotation_key in list(annotations_list_input.keys()):
2991                log.info(f"   {annotation_key}")
2992
2993            # List of annotations and associated fields
2994            annotations_list = {}
2995
2996            for annotation_file in annotations_list_input:
2997
2998                # Explode annotations if ALL
2999                if (
3000                    annotation_file.upper() == "ALL"
3001                    or annotation_file.upper().startswith("ALL:")
3002                ):
3003
3004                    # check ALL parameters (formats, releases)
3005                    annotation_file_split = annotation_file.split(":")
3006                    database_formats = "parquet"
3007                    database_releases = "current"
3008                    for annotation_file_option in annotation_file_split[1:]:
3009                        database_all_options_split = annotation_file_option.split("=")
3010                        if database_all_options_split[0] == "format":
3011                            database_formats = database_all_options_split[1].split("+")
3012                        if database_all_options_split[0] == "release":
3013                            database_releases = database_all_options_split[1].split("+")
3014
3015                    # Scan for availabled databases
3016                    databases_infos_dict = self.scan_databases(
3017                        database_formats=database_formats,
3018                        database_releases=database_releases,
3019                    )
3020
3021                    # Add found databases in annotation parameters
3022                    for database_infos in databases_infos_dict.keys():
3023                        annotations_list[database_infos] = {"INFO": None}
3024
3025                else:
3026                    annotations_list[annotation_file] = annotations_list_input[
3027                        annotation_file
3028                    ]
3029
3030            # Check each databases
3031            if len(annotations_list):
3032
3033                log.info(
3034                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
3035                )
3036
3037                for annotation_file in annotations_list:
3038
3039                    # Init
3040                    annotations = annotations_list.get(annotation_file, None)
3041
3042                    # Annotation snpEff
3043                    if annotation_file.startswith("snpeff"):
3044
3045                        log.debug(f"Quick Annotation snpEff")
3046
3047                        if "snpeff" not in param["annotation"]:
3048                            param["annotation"]["snpeff"] = {}
3049
3050                        if "options" not in param["annotation"]["snpeff"]:
3051                            param["annotation"]["snpeff"]["options"] = ""
3052
3053                        # snpEff options in annotations
3054                        param["annotation"]["snpeff"]["options"] = "".join(
3055                            annotation_file.split(":")[1:]
3056                        )
3057
3058                    # Annotation Annovar
3059                    elif annotation_file.startswith("annovar"):
3060
3061                        log.debug(f"Quick Annotation Annovar")
3062
3063                        if "annovar" not in param["annotation"]:
3064                            param["annotation"]["annovar"] = {}
3065
3066                        if "annotations" not in param["annotation"]["annovar"]:
3067                            param["annotation"]["annovar"]["annotations"] = {}
3068
3069                        # Options
3070                        annotation_file_split = annotation_file.split(":")
3071                        for annotation_file_annotation in annotation_file_split[1:]:
3072                            if annotation_file_annotation:
3073                                param["annotation"]["annovar"]["annotations"][
3074                                    annotation_file_annotation
3075                                ] = annotations
3076
3077                    # Annotation Exomiser
3078                    elif annotation_file.startswith("exomiser"):
3079
3080                        log.debug(f"Quick Annotation Exomiser")
3081
3082                        param["annotation"]["exomiser"] = params_string_to_dict(
3083                            annotation_file
3084                        )
3085
3086                    # Annotation Splice
3087                    elif annotation_file.startswith("splice"):
3088
3089                        log.debug(f"Quick Annotation Splice")
3090
3091                        param["annotation"]["splice"] = params_string_to_dict(
3092                            annotation_file
3093                        )
3094
3095                    # Annotation Parquet or BCFTOOLS
3096                    else:
3097
3098                        # Tools detection
3099                        if annotation_file.startswith("bcftools:"):
3100                            annotation_tool_initial = "bcftools"
3101                            annotation_file = ":".join(annotation_file.split(":")[1:])
3102                        elif annotation_file.startswith("snpsift:"):
3103                            annotation_tool_initial = "snpsift"
3104                            annotation_file = ":".join(annotation_file.split(":")[1:])
3105                        elif annotation_file.startswith("bigwig:"):
3106                            annotation_tool_initial = "bigwig"
3107                            annotation_file = ":".join(annotation_file.split(":")[1:])
3108                        else:
3109                            annotation_tool_initial = None
3110
3111                        # list of files
3112                        annotation_file_list = annotation_file.replace("+", ":").split(
3113                            ":"
3114                        )
3115
3116                        for annotation_file in annotation_file_list:
3117
3118                            if annotation_file:
3119
3120                                # Annotation tool initial
3121                                annotation_tool = annotation_tool_initial
3122
3123                                # Find file
3124                                annotation_file_found = None
3125
3126                                if os.path.exists(annotation_file):
3127                                    annotation_file_found = annotation_file
3128                                elif os.path.exists(full_path(annotation_file)):
3129                                    annotation_file_found = full_path(annotation_file)
3130                                else:
3131                                    # Find within assembly folders
3132                                    for annotations_database in annotations_databases:
3133                                        found_files = find_all(
3134                                            annotation_file,
3135                                            os.path.join(
3136                                                annotations_database, assembly
3137                                            ),
3138                                        )
3139                                        if len(found_files) > 0:
3140                                            annotation_file_found = found_files[0]
3141                                            break
3142                                    if not annotation_file_found and not assembly:
3143                                        # Find within folders
3144                                        for (
3145                                            annotations_database
3146                                        ) in annotations_databases:
3147                                            found_files = find_all(
3148                                                annotation_file, annotations_database
3149                                            )
3150                                            if len(found_files) > 0:
3151                                                annotation_file_found = found_files[0]
3152                                                break
3153                                log.debug(
3154                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
3155                                )
3156
3157                                # Full path
3158                                annotation_file_found = full_path(annotation_file_found)
3159
3160                                if annotation_file_found:
3161
3162                                    database = Database(database=annotation_file_found)
3163                                    quick_annotation_format = database.get_format()
3164                                    quick_annotation_is_compressed = (
3165                                        database.is_compressed()
3166                                    )
3167                                    quick_annotation_is_indexed = os.path.exists(
3168                                        f"{annotation_file_found}.tbi"
3169                                    )
3170                                    bcftools_preference = False
3171
3172                                    # Check Annotation Tool
3173                                    if not annotation_tool:
3174                                        if (
3175                                            bcftools_preference
3176                                            and quick_annotation_format
3177                                            in ["vcf", "bed"]
3178                                            and quick_annotation_is_compressed
3179                                            and quick_annotation_is_indexed
3180                                        ):
3181                                            annotation_tool = "bcftools"
3182                                        elif quick_annotation_format in [
3183                                            "vcf",
3184                                            "bed",
3185                                            "tsv",
3186                                            "tsv",
3187                                            "csv",
3188                                            "json",
3189                                            "tbl",
3190                                            "parquet",
3191                                            "duckdb",
3192                                        ]:
3193                                            annotation_tool = "parquet"
3194                                        elif quick_annotation_format in ["bw"]:
3195                                            annotation_tool = "bigwig"
3196                                        else:
3197                                            log.error(
3198                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3199                                            )
3200                                            raise ValueError(
3201                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3202                                            )
3203
3204                                    log.debug(
3205                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3206                                    )
3207
3208                                    # Annotation Tool dispatch
3209                                    if annotation_tool:
3210                                        if annotation_tool not in param["annotation"]:
3211                                            param["annotation"][annotation_tool] = {}
3212                                        if (
3213                                            "annotations"
3214                                            not in param["annotation"][annotation_tool]
3215                                        ):
3216                                            param["annotation"][annotation_tool][
3217                                                "annotations"
3218                                            ] = {}
3219                                        param["annotation"][annotation_tool][
3220                                            "annotations"
3221                                        ][annotation_file_found] = annotations
3222
3223                                else:
3224                                    log.warning(
3225                                        f"Quick Annotation File {annotation_file} does NOT exist"
3226                                    )
3227
3228                self.set_param(param)
3229
3230        if param.get("annotation", None):
3231            log.info("Annotations")
3232            if param.get("annotation", {}).get("parquet", None):
3233                log.info("Annotations 'parquet'...")
3234                self.annotation_parquet()
3235            if param.get("annotation", {}).get("bcftools", None):
3236                log.info("Annotations 'bcftools'...")
3237                self.annotation_bcftools()
3238            if param.get("annotation", {}).get("snpsift", None):
3239                log.info("Annotations 'snpsift'...")
3240                self.annotation_snpsift()
3241            if param.get("annotation", {}).get("bigwig", None):
3242                log.info("Annotations 'bigwig'...")
3243                self.annotation_bigwig()
3244            if param.get("annotation", {}).get("annovar", None):
3245                log.info("Annotations 'annovar'...")
3246                self.annotation_annovar()
3247            if param.get("annotation", {}).get("snpeff", None):
3248                log.info("Annotations 'snpeff'...")
3249                self.annotation_snpeff()
3250            if param.get("annotation", {}).get("exomiser", None) is not None:
3251                log.info("Annotations 'exomiser'...")
3252                self.annotation_exomiser()
3253            if param.get("annotation", {}).get("splice", None) is not None:
3254                log.info("Annotations 'splice' ...")
3255                self.annotation_splice()
3256
3257        # Explode INFOS fields into table fields
3258        if self.get_explode_infos():
3259            self.explode_infos(
3260                prefix=self.get_explode_infos_prefix(),
3261                fields=self.get_explode_infos_fields(),
3262                force=True,
3263            )

This section annotates the VCF file with the annotations specified in the configuration file, dispatching each annotation source to the appropriate annotation tool.

def annotation_bigwig(self, threads: int = None) -> None:
3265    def annotation_bigwig(self, threads: int = None) -> None:
3266        """
3267        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
3268
3269        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
3270        number of threads to be used for parallel processing during the annotation process. If the
3271        `threads` parameter is not provided, the method will attempt to determine the optimal number of
3272        threads to use based on the system configuration
3273        :type threads: int
3274        :return: True
3275        """
3276
3277        # DEBUG
3278        log.debug("Start annotation with bigwig databases")
3279
3280        # # Threads
3281        # if not threads:
3282        #     threads = self.get_threads()
3283        # log.debug("Threads: " + str(threads))
3284
3285        # Config
3286        config = self.get_config()
3287        log.debug("Config: " + str(config))
3288
3289        # Config - BCFTools databases folders
3290        databases_folders = set(
3291            self.get_config()
3292            .get("folders", {})
3293            .get("databases", {})
3294            .get("annotations", ["."])
3295            + self.get_config()
3296            .get("folders", {})
3297            .get("databases", {})
3298            .get("bigwig", ["."])
3299        )
3300        log.debug("Databases annotations: " + str(databases_folders))
3301
3302        # Param
3303        annotations = (
3304            self.get_param()
3305            .get("annotation", {})
3306            .get("bigwig", {})
3307            .get("annotations", None)
3308        )
3309        log.debug("Annotations: " + str(annotations))
3310
3311        # Assembly
3312        assembly = self.get_param().get(
3313            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3314        )
3315
3316        # Data
3317        table_variants = self.get_table_variants()
3318
3319        # Check if not empty
3320        log.debug("Check if not empty")
3321        sql_query_chromosomes = (
3322            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3323        )
3324        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3325        if not sql_query_chromosomes_df["count"][0]:
3326            log.info(f"VCF empty")
3327            return
3328
3329        # VCF header
3330        vcf_reader = self.get_header()
3331        log.debug("Initial header: " + str(vcf_reader.infos))
3332
3333        # Existing annotations
3334        for vcf_annotation in self.get_header().infos:
3335
3336            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3337            log.debug(
3338                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3339            )
3340
3341        if annotations:
3342
3343            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3344
3345                # Export VCF file
3346                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3347
3348                # annotation_bigwig_config
3349                annotation_bigwig_config_list = []
3350
3351                for annotation in annotations:
3352                    annotation_fields = annotations[annotation]
3353
3354                    # Annotation Name
3355                    annotation_name = os.path.basename(annotation)
3356
3357                    if not annotation_fields:
3358                        annotation_fields = {"INFO": None}
3359
3360                    log.debug(f"Annotation '{annotation_name}'")
3361                    log.debug(
3362                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3363                    )
3364
3365                    # Create Database
3366                    database = Database(
3367                        database=annotation,
3368                        databases_folders=databases_folders,
3369                        assembly=assembly,
3370                    )
3371
3372                    # Find files
3373                    db_file = database.get_database()
3374                    db_file = full_path(db_file)
3375                    db_hdr_file = database.get_header_file()
3376                    db_hdr_file = full_path(db_hdr_file)
3377                    db_file_type = database.get_format()
3378
3379                    # If db_file is http ?
3380                    if database.get_database().startswith("http"):
3381
3382                        # Datbase is HTTP URL
3383                        db_file_is_http = True
3384
3385                        # DB file keep as URL
3386                        db_file = database.get_database()
3387                        log.warning(
3388                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
3389                        )
3390
3391                        # Retrieve automatic annotation field name
3392                        annotation_field = clean_annotation_field(
3393                            os.path.basename(db_file).replace(".bw", "")
3394                        )
3395                        log.debug(
3396                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
3397                        )
3398
3399                        # Create automatic header file
3400                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
3401                        with open(db_hdr_file, "w") as f:
3402                            f.write("##fileformat=VCFv4.2\n")
3403                            f.write(
3404                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
3405                            )
3406                            f.write(f"#CHROM	START	END	{annotation_field}\n")
3407
3408                    else:
3409
3410                        # Datbase is NOT HTTP URL
3411                        db_file_is_http = False
3412
3413                    # Check index - try to create if not exists
3414                    if (
3415                        db_file is None
3416                        or db_hdr_file is None
3417                        or (not os.path.exists(db_file) and not db_file_is_http)
3418                        or not os.path.exists(db_hdr_file)
3419                        or not db_file_type in ["bw"]
3420                    ):
3421                        # if False:
3422                        log.error("Annotation failed: database not valid")
3423                        log.error(f"Annotation annotation file: {db_file}")
3424                        log.error(f"Annotation annotation file type: {db_file_type}")
3425                        log.error(f"Annotation annotation header: {db_hdr_file}")
3426                        raise ValueError(
3427                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
3428                        )
3429                    else:
3430
3431                        # Log
3432                        log.debug(
3433                            f"Annotation '{annotation}' - file: "
3434                            + str(db_file)
3435                            + " and "
3436                            + str(db_hdr_file)
3437                        )
3438
3439                        # Load header as VCF object
3440                        db_hdr_vcf = Variants(input=db_hdr_file)
3441                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3442                        log.debug(
3443                            "Annotation database header: "
3444                            + str(db_hdr_vcf_header_infos)
3445                        )
3446
3447                        # For all fields in database
3448                        annotation_fields_full = False
3449                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3450                            annotation_fields = {
3451                                key: key for key in db_hdr_vcf_header_infos
3452                            }
3453                            log.debug(
3454                                "Annotation database header - All annotations added: "
3455                                + str(annotation_fields)
3456                            )
3457                            annotation_fields_full = True
3458
3459                        # Init
3460                        cyvcf2_header_rename_dict = {}
3461                        cyvcf2_header_list = []
3462                        cyvcf2_header_indexes = {}
3463
3464                        # process annotation fields
3465                        for annotation_field in annotation_fields:
3466
3467                            # New annotation name
3468                            annotation_field_new = annotation_fields[annotation_field]
3469
3470                            # Check annotation field and index in header
3471                            if (
3472                                annotation_field
3473                                in db_hdr_vcf.get_header_columns_as_list()
3474                            ):
3475                                annotation_field_index = (
3476                                    db_hdr_vcf.get_header_columns_as_list().index(
3477                                        annotation_field
3478                                    )
3479                                    - 3
3480                                )
3481                                cyvcf2_header_indexes[annotation_field_new] = (
3482                                    annotation_field_index
3483                                )
3484                            else:
3485                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
3486                                log.error(msg_err)
3487                                raise ValueError(msg_err)
3488
3489                            # Append annotation field in cyvcf2 header list
3490                            cyvcf2_header_rename_dict[annotation_field_new] = (
3491                                db_hdr_vcf_header_infos[annotation_field].id
3492                            )
3493                            cyvcf2_header_list.append(
3494                                {
3495                                    "ID": annotation_field_new,
3496                                    "Number": db_hdr_vcf_header_infos[
3497                                        annotation_field
3498                                    ].num,
3499                                    "Type": db_hdr_vcf_header_infos[
3500                                        annotation_field
3501                                    ].type,
3502                                    "Description": db_hdr_vcf_header_infos[
3503                                        annotation_field
3504                                    ].desc,
3505                                }
3506                            )
3507
3508                            # Add header on VCF
3509                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
3510                                annotation_field_new,
3511                                db_hdr_vcf_header_infos[annotation_field].num,
3512                                db_hdr_vcf_header_infos[annotation_field].type,
3513                                db_hdr_vcf_header_infos[annotation_field].desc,
3514                                "HOWARD BigWig annotation",
3515                                "unknown",
3516                                self.code_type_map[
3517                                    db_hdr_vcf_header_infos[annotation_field].type
3518                                ],
3519                            )
3520
3521                        # Load bigwig database
3522                        bw_db = pyBigWig.open(db_file)
3523                        if bw_db.isBigWig():
3524                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
3525                        else:
3526                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
3527                            log.error(msg_err)
3528                            raise ValueError(msg_err)
3529
3530                        annotation_bigwig_config_list.append(
3531                            {
3532                                "db_file": db_file,
3533                                "bw_db": bw_db,
3534                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
3535                                "cyvcf2_header_list": cyvcf2_header_list,
3536                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
3537                            }
3538                        )
3539
3540                # Annotate
3541                if annotation_bigwig_config_list:
3542
3543                    # Annotation config
3544                    log.debug(
3545                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
3546                    )
3547
3548                    # Export VCF file
3549                    self.export_variant_vcf(
3550                        vcf_file=tmp_vcf_name,
3551                        remove_info=True,
3552                        add_samples=False,
3553                        index=True,
3554                    )
3555
3556                    # Load input tmp file
3557                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
3558
3559                    # Add header in input file
3560                    for annotation_bigwig_config in annotation_bigwig_config_list:
3561                        for cyvcf2_header_field in annotation_bigwig_config.get(
3562                            "cyvcf2_header_list", []
3563                        ):
3564                            log.info(
3565                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
3566                            )
3567                            input_vcf.add_info_to_header(cyvcf2_header_field)
3568
3569                    # Create output VCF file
3570                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
3571                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
3572
3573                    # Fetch variants
3574                    log.info(f"Annotations 'bigwig' start...")
3575                    for variant in input_vcf:
3576
3577                        for annotation_bigwig_config in annotation_bigwig_config_list:
3578
3579                            # DB and indexes
3580                            bw_db = annotation_bigwig_config.get("bw_db", None)
3581                            cyvcf2_header_indexes = annotation_bigwig_config.get(
3582                                "cyvcf2_header_indexes", None
3583                            )
3584
3585                            # Retrieve value from chrom pos
3586                            res = bw_db.values(
3587                                variant.CHROM, variant.POS - 1, variant.POS
3588                            )
3589
3590                            # For each annotation fields (and indexes)
3591                            for cyvcf2_header_index in cyvcf2_header_indexes:
3592
3593                                # If value is NOT nNone
3594                                if not np.isnan(
3595                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
3596                                ):
3597                                    variant.INFO[cyvcf2_header_index] = res[
3598                                        cyvcf2_header_indexes[cyvcf2_header_index]
3599                                    ]
3600
3601                        # Add record in output file
3602                        output_vcf.write_record(variant)
3603
3604                    # Log
3605                    log.debug(f"Annotation done.")
3606
3607                    # Close and write file
3608                    log.info(f"Annotations 'bigwig' write...")
3609                    output_vcf.close()
3610                    log.debug(f"Write done.")
3611
3612                    # Update variants
3613                    log.info(f"Annotations 'bigwig' update...")
3614                    self.update_from_vcf(output_vcf_file)
3615                    log.debug(f"Update done.")
3616
3617        return True

The function annotation_bigwig annotates variants in a VCF file using bigwig databases.

Parameters
  • threads: Intended number of threads for parallel processing during annotation. Note that the current implementation does not use this parameter: the thread-handling logic is commented out in the method body, and the annotation loop runs single-threaded.
Returns

True

def annotation_snpsift(self, threads: int = None) -> None:
3619    def annotation_snpsift(self, threads: int = None) -> None:
3620        """
3621        This function annotate with bcftools
3622
3623        :param threads: Number of threads to use
3624        :return: the value of the variable "return_value".
3625        """
3626
3627        # DEBUG
3628        log.debug("Start annotation with bcftools databases")
3629
3630        # Threads
3631        if not threads:
3632            threads = self.get_threads()
3633        log.debug("Threads: " + str(threads))
3634
3635        # Config
3636        config = self.get_config()
3637        log.debug("Config: " + str(config))
3638
3639        # Config - snpSift
3640        snpsift_bin_command = get_bin_command(
3641            bin="SnpSift.jar",
3642            tool="snpsift",
3643            bin_type="jar",
3644            config=config,
3645            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3646        )
3647        if not snpsift_bin_command:
3648            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3649            log.error(msg_err)
3650            raise ValueError(msg_err)
3651
3652        # Config - bcftools
3653        bcftools_bin_command = get_bin_command(
3654            bin="bcftools",
3655            tool="bcftools",
3656            bin_type="bin",
3657            config=config,
3658            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3659        )
3660        if not bcftools_bin_command:
3661            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3662            log.error(msg_err)
3663            raise ValueError(msg_err)
3664
3665        # Config - BCFTools databases folders
3666        databases_folders = set(
3667            self.get_config()
3668            .get("folders", {})
3669            .get("databases", {})
3670            .get("annotations", ["."])
3671            + self.get_config()
3672            .get("folders", {})
3673            .get("databases", {})
3674            .get("bcftools", ["."])
3675        )
3676        log.debug("Databases annotations: " + str(databases_folders))
3677
3678        # Param
3679        annotations = (
3680            self.get_param()
3681            .get("annotation", {})
3682            .get("snpsift", {})
3683            .get("annotations", None)
3684        )
3685        log.debug("Annotations: " + str(annotations))
3686
3687        # Assembly
3688        assembly = self.get_param().get(
3689            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3690        )
3691
3692        # Data
3693        table_variants = self.get_table_variants()
3694
3695        # Check if not empty
3696        log.debug("Check if not empty")
3697        sql_query_chromosomes = (
3698            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3699        )
3700        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3701        if not sql_query_chromosomes_df["count"][0]:
3702            log.info(f"VCF empty")
3703            return
3704
3705        # VCF header
3706        vcf_reader = self.get_header()
3707        log.debug("Initial header: " + str(vcf_reader.infos))
3708
3709        # Existing annotations
3710        for vcf_annotation in self.get_header().infos:
3711
3712            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3713            log.debug(
3714                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3715            )
3716
3717        if annotations:
3718
3719            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3720
3721                # Export VCF file
3722                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3723
3724                # Init
3725                commands = {}
3726
3727                for annotation in annotations:
3728                    annotation_fields = annotations[annotation]
3729
3730                    # Annotation Name
3731                    annotation_name = os.path.basename(annotation)
3732
3733                    if not annotation_fields:
3734                        annotation_fields = {"INFO": None}
3735
3736                    log.debug(f"Annotation '{annotation_name}'")
3737                    log.debug(
3738                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3739                    )
3740
3741                    # Create Database
3742                    database = Database(
3743                        database=annotation,
3744                        databases_folders=databases_folders,
3745                        assembly=assembly,
3746                    )
3747
3748                    # Find files
3749                    db_file = database.get_database()
3750                    db_file = full_path(db_file)
3751                    db_hdr_file = database.get_header_file()
3752                    db_hdr_file = full_path(db_hdr_file)
3753                    db_file_type = database.get_format()
3754                    db_tbi_file = f"{db_file}.tbi"
3755                    db_file_compressed = database.is_compressed()
3756
3757                    # Check if compressed
3758                    if not db_file_compressed:
3759                        log.error(
3760                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3761                        )
3762                        raise ValueError(
3763                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3764                        )
3765
3766                    # Check if indexed
3767                    if not os.path.exists(db_tbi_file):
3768                        log.error(
3769                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3770                        )
3771                        raise ValueError(
3772                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3773                        )
3774
3775                    # Check index - try to create if not exists
3776                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3777                        log.error("Annotation failed: database not valid")
3778                        log.error(f"Annotation annotation file: {db_file}")
3779                        log.error(f"Annotation annotation header: {db_hdr_file}")
3780                        log.error(f"Annotation annotation index: {db_tbi_file}")
3781                        raise ValueError(
3782                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3783                        )
3784                    else:
3785
3786                        log.debug(
3787                            f"Annotation '{annotation}' - file: "
3788                            + str(db_file)
3789                            + " and "
3790                            + str(db_hdr_file)
3791                        )
3792
3793                        # Load header as VCF object
3794                        db_hdr_vcf = Variants(input=db_hdr_file)
3795                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3796                        log.debug(
3797                            "Annotation database header: "
3798                            + str(db_hdr_vcf_header_infos)
3799                        )
3800
3801                        # For all fields in database
3802                        annotation_fields_full = False
3803                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3804                            annotation_fields = {
3805                                key: key for key in db_hdr_vcf_header_infos
3806                            }
3807                            log.debug(
3808                                "Annotation database header - All annotations added: "
3809                                + str(annotation_fields)
3810                            )
3811                            annotation_fields_full = True
3812
3813                        # # Create file for field rename
3814                        # log.debug("Create file for field rename")
3815                        # tmp_rename = NamedTemporaryFile(
3816                        #     prefix=self.get_prefix(),
3817                        #     dir=self.get_tmp_dir(),
3818                        #     suffix=".rename",
3819                        #     delete=False,
3820                        # )
3821                        # tmp_rename_name = tmp_rename.name
3822                        # tmp_files.append(tmp_rename_name)
3823
3824                        # Number of fields
3825                        nb_annotation_field = 0
3826                        annotation_list = []
3827                        annotation_infos_rename_list = []
3828
3829                        for annotation_field in annotation_fields:
3830
3831                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3832                            annotation_fields_new_name = annotation_fields.get(
3833                                annotation_field, annotation_field
3834                            )
3835                            if not annotation_fields_new_name:
3836                                annotation_fields_new_name = annotation_field
3837
3838                            # Check if field is in DB and if field is not elready in input data
3839                            if (
3840                                annotation_field in db_hdr_vcf.get_header().infos
3841                                and annotation_fields_new_name
3842                                not in self.get_header().infos
3843                            ):
3844
3845                                log.info(
3846                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3847                                )
3848
3849                                # BCFTools annotate param to rename fields
3850                                if annotation_field != annotation_fields_new_name:
3851                                    annotation_infos_rename_list.append(
3852                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3853                                    )
3854
3855                                # Add INFO field to header
3856                                db_hdr_vcf_header_infos_number = (
3857                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3858                                )
3859                                db_hdr_vcf_header_infos_type = (
3860                                    db_hdr_vcf_header_infos[annotation_field].type
3861                                    or "String"
3862                                )
3863                                db_hdr_vcf_header_infos_description = (
3864                                    db_hdr_vcf_header_infos[annotation_field].desc
3865                                    or f"{annotation_field} description"
3866                                )
3867                                db_hdr_vcf_header_infos_source = (
3868                                    db_hdr_vcf_header_infos[annotation_field].source
3869                                    or "unknown"
3870                                )
3871                                db_hdr_vcf_header_infos_version = (
3872                                    db_hdr_vcf_header_infos[annotation_field].version
3873                                    or "unknown"
3874                                )
3875
3876                                vcf_reader.infos[annotation_fields_new_name] = (
3877                                    vcf.parser._Info(
3878                                        annotation_fields_new_name,
3879                                        db_hdr_vcf_header_infos_number,
3880                                        db_hdr_vcf_header_infos_type,
3881                                        db_hdr_vcf_header_infos_description,
3882                                        db_hdr_vcf_header_infos_source,
3883                                        db_hdr_vcf_header_infos_version,
3884                                        self.code_type_map[
3885                                            db_hdr_vcf_header_infos_type
3886                                        ],
3887                                    )
3888                                )
3889
3890                                annotation_list.append(annotation_field)
3891
3892                                nb_annotation_field += 1
3893
3894                            else:
3895
3896                                if (
3897                                    annotation_field
3898                                    not in db_hdr_vcf.get_header().infos
3899                                ):
3900                                    log.warning(
3901                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3902                                    )
3903                                if (
3904                                    annotation_fields_new_name
3905                                    in self.get_header().infos
3906                                ):
3907                                    log.warning(
3908                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3909                                    )
3910
3911                        log.info(
3912                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3913                        )
3914
3915                        annotation_infos = ",".join(annotation_list)
3916
3917                        if annotation_infos != "":
3918
3919                            # Annotated VCF (and error file)
3920                            tmp_annotation_vcf_name = os.path.join(
3921                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3922                            )
3923                            tmp_annotation_vcf_name_err = (
3924                                tmp_annotation_vcf_name + ".err"
3925                            )
3926
3927                            # Add fields to annotate
3928                            if not annotation_fields_full:
3929                                annotation_infos_option = f"-info {annotation_infos}"
3930                            else:
3931                                annotation_infos_option = ""
3932
3933                            # Info fields rename
3934                            if annotation_infos_rename_list:
3935                                annotation_infos_rename = " -c " + ",".join(
3936                                    annotation_infos_rename_list
3937                                )
3938                            else:
3939                                annotation_infos_rename = ""
3940
3941                            # Annotate command
3942                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3943
3944                            # Add command
3945                            commands[command_annotate] = tmp_annotation_vcf_name
3946
3947                if commands:
3948
3949                    # Export VCF file
3950                    self.export_variant_vcf(
3951                        vcf_file=tmp_vcf_name,
3952                        remove_info=True,
3953                        add_samples=False,
3954                        index=True,
3955                    )
3956                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3957
3958                    # Num command
3959                    nb_command = 0
3960
3961                    # Annotate
3962                    for command_annotate in commands:
3963                        nb_command += 1
3964                        log.info(
3965                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3966                        )
3967                        log.debug(f"command_annotate={command_annotate}")
3968                        run_parallel_commands([command_annotate], threads)
3969
3970                        # Debug
3971                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3972
3973                        # Update variants
3974                        log.info(
3975                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3976                        )
3977                        self.update_from_vcf(commands[command_annotate])

This function annotates with bcftools.

Parameters
  • threads: Number of threads to use
Returns

None: the variants table is updated in place.

def annotation_bcftools(self, threads: int = None) -> None:
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using `bcftools annotate`.

        Databases are read from param section "annotation" -> "bcftools" ->
        "annotations". For each database (VCF or BED; must be bgzip-compressed
        and tabix-indexed), the requested INFO fields are registered in the VCF
        header, one `bcftools annotate` command is built per chromosome over
        padded and merged variant regions, all commands are run in parallel,
        the per-chromosome outputs are merged with `bcftools merge`, and the
        merged result is loaded back into the variants table.

        :param threads: Number of threads to use; defaults to self.get_threads()
        :return: None; the variants table is updated in place
        :raises ValueError: if the bcftools binary cannot be resolved, a database
            file is not compressed/indexed/valid, or an annotation command
            reported "[E::" errors in its stderr capture
        """

        # Trace entry
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the instance-level setting when not provided
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep temporary files/folders when running in debug verbosity,
        # so failed commands can be inspected afterwards
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Resolve the bcftools executable from config (or the default tools folder)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Folders searched for annotation databases: the generic "annotations"
        # folders plus the bcftools-specific ones (deduplicated via set)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Requested annotations: mapping of database -> fields (param section)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly: param takes precedence over config, then the default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Variants table name
        table_variants = self.get_table_variants()

        # Nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Temporary VCF that will hold the exported variants to annotate.
        # delete=False: the file name is passed to external shell commands,
        # so it must outlive this handle
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # Current VCF header object; new INFO fields are added to it below
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log annotations already present in the input header
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Accumulators: annotated chunk files, shell commands,
            # temp files to remove, and stderr capture files
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation name (database basename, used in log messages)
                annotation_name = os.path.basename(annotation)

                # No explicit field list means "all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Locate the database (searches the configured folders/assembly)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Resolve database file, header file, format, index and compression
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # bcftools annotate requires a bgzip-compressed database
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # ... and a tabix index next to it
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Database file and its header file must both exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Parse the database header file as a Variants object to
                    # get its INFO field definitions
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" keyword expands to every field in the database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Fields selected for this database
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Target field name (rename support); falls back to the
                        # original name when no new name is configured
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Keep the field only if it exists in the database header
                        # and is not already present in the input header
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # INFO metadata from the database header, with
                            # defaults for missing number/type/description/...
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            # Register the (possibly renamed) INFO field in the
                            # output header
                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c syntax: "NEW:=INFO/OLD" renames a field,
                            # a bare name copies it as-is
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Skipped: either missing from the database header,
                            # or the target name already exists in the input
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    # Comma-separated -c argument for bcftools annotate
                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Header file for bcftools -h: keep only "##" meta lines
                        # (drop any "#CHROM" line and variant records)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Extraction command (zcat for gzipped header files)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run synchronously (single command)
                        run_parallel_commands([command_extract_header], 1)

                        # Distinct chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases carry no header: map the first three
                        # columns to CHROM,POS,POS before the annotation fields
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One annotate command per chromosome
                        for chrom in chomosomes_list:

                            # BED of regions restricting bcftools to this chromosome
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Build intervals of +/- `window` bp around each
                            # variant position (clamped at 0), then merge them
                            # via merge_regions (defined elsewhere; presumably
                            # collapses overlapping intervals)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            # Write the regions as a tab-delimited BED file
                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Output chunk for this chromosome (and its stderr file)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Build the bcftools annotate command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # --pair-logic exact: match REF/ALT pairs exactly;
                            # -Oz1: bgzipped output at compression level 1 (fast,
                            # these are intermediate files); stderr appended to
                            # the .err file; chunk tabix-indexed for the merge
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Queue the command
                            commands.append(command_annotate)

            # Run only when at least one annotate command was built
            if commands:

                # Export the current variants to the temporary VCF (indexed,
                # INFO stripped, no sample columns)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Split the thread budget across the parallel bcftools commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() may yield 0 when commands outnumber threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Inject the per-command --threads option when it is worth it
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Run all annotate commands in parallel
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge the per-chromosome annotated chunks
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Merged output file (delete=True: only read back below)
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Cleanup appended to the merge command so intermediates are
                    # removed only after a successful merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Merge original + annotated chunks (--force-samples: inputs
                    # share sample names since they all derive from tmp_vcf_name)
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Scan the captured stderr files: "[W::" lines are warnings,
                    # "[E::" lines are errors (htslib message prefixes)
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # Surface unique warnings/errors at info level
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # Full unique message list at debug level
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # Any "[E::" line means at least one command failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Load the merged annotations back into the variants table
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with Exomiser.

Parameters
  • threads: Number of threads to use
Returns

None: the variants table is updated in place.

def annotation_exomiser(self, threads: int = None) -> None:
4460    def annotation_exomiser(self, threads: int = None) -> None:
4461        """
4462        This function annotate with Exomiser
4463
4464        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4465        - "analysis" (dict/file):
4466            Full analysis dictionnary parameters (see Exomiser docs).
4467            Either a dict, or a file in JSON or YAML format.
4468            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4469            Default : None
4470        - "preset" (string):
4471            Analysis preset (available in config folder).
4472            Used if no full "analysis" is provided.
4473            Default: "exome"
4474        - "phenopacket" (dict/file):
4475            Samples and phenotipic features parameters (see Exomiser docs).
4476            Either a dict, or a file in JSON or YAML format.
4477            Default: None
4478        - "subject" (dict):
4479            Sample parameters (see Exomiser docs).
4480            Example:
4481                "subject":
4482                    {
4483                        "id": "ISDBM322017",
4484                        "sex": "FEMALE"
4485                    }
4486            Default: None
4487        - "sample" (string):
4488            Sample name to construct "subject" section:
4489                "subject":
4490                    {
4491                        "id": "<sample>",
4492                        "sex": "UNKNOWN_SEX"
4493                    }
4494            Default: None
4495        - "phenotypicFeatures" (dict)
4496            Phenotypic features to construct "subject" section.
4497            Example:
4498                "phenotypicFeatures":
4499                    [
4500                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4501                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4502                    ]
4503        - "hpo" (list)
4504            List of HPO ids as phenotypic features.
4505            Example:
4506                "hpo": ['0001156', '0001363', '0011304', '0010055']
4507            Default: []
4508        - "outputOptions" (dict):
4509            Output options (see Exomiser docs).
4510            Default:
4511                "output_options" =
4512                    {
4513                        "outputContributingVariantsOnly": False,
4514                        "numGenes": 0,
4515                        "outputFormats": ["TSV_VARIANT", "VCF"]
4516                    }
4517        - "transcript_source" (string):
4518            Transcript source (either "refseq", "ucsc", "ensembl")
4519            Default: "refseq"
4520        - "exomiser_to_info" (boolean):
4521            Add exomiser TSV file columns as INFO fields in VCF.
4522            Default: False
4523        - "release" (string):
4524            Exomise database release.
4525            If not exists, database release will be downloaded (take a while).
4526            Default: None (provided by application.properties configuration file)
4527        - "exomiser_application_properties" (file):
4528            Exomiser configuration file (see Exomiser docs).
4529            Useful to automatically download databases (especially for specific genome databases).
4530
4531        Notes:
4532        - If no sample in parameters, first sample in VCF will be chosen
4533        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4534
4535        :param threads: The number of threads to use
4536        :return: None.
4537        """
4538
4539        # DEBUG
4540        log.debug("Start annotation with Exomiser databases")
4541
4542        # Threads
4543        if not threads:
4544            threads = self.get_threads()
4545        log.debug("Threads: " + str(threads))
4546
4547        # Config
4548        config = self.get_config()
4549        log.debug("Config: " + str(config))
4550
4551        # Config - Folders - Databases
4552        databases_folders = (
4553            config.get("folders", {})
4554            .get("databases", {})
4555            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4556        )
4557        databases_folders = full_path(databases_folders)
4558        if not os.path.exists(databases_folders):
4559            log.error(f"Databases annotations: {databases_folders} NOT found")
4560        log.debug("Databases annotations: " + str(databases_folders))
4561
4562        # Config - Exomiser
4563        exomiser_bin_command = get_bin_command(
4564            bin="exomiser-cli*.jar",
4565            tool="exomiser",
4566            bin_type="jar",
4567            config=config,
4568            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4569        )
4570        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4571        if not exomiser_bin_command:
4572            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4573            log.error(msg_err)
4574            raise ValueError(msg_err)
4575
4576        # Param
4577        param = self.get_param()
4578        log.debug("Param: " + str(param))
4579
4580        # Param - Exomiser
4581        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4582        log.debug(f"Param Exomiser: {param_exomiser}")
4583
4584        # Param - Assembly
4585        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4586        log.debug("Assembly: " + str(assembly))
4587
4588        # Data
4589        table_variants = self.get_table_variants()
4590
4591        # Check if not empty
4592        log.debug("Check if not empty")
4593        sql_query_chromosomes = (
4594            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4595        )
4596        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4597            log.info(f"VCF empty")
4598            return False
4599
4600        # VCF header
4601        vcf_reader = self.get_header()
4602        log.debug("Initial header: " + str(vcf_reader.infos))
4603
4604        # Samples
4605        samples = self.get_header_sample_list()
4606        if not samples:
4607            log.error("No Samples in VCF")
4608            return False
4609        log.debug(f"Samples: {samples}")
4610
4611        # Memory limit
4612        memory_limit = self.get_memory("8G")
4613        log.debug(f"memory_limit: {memory_limit}")
4614
4615        # Exomiser java options
4616        exomiser_java_options = (
4617            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4618        )
4619        log.debug(f"Exomiser java options: {exomiser_java_options}")
4620
4621        # Download Exomiser (if not exists)
4622        exomiser_release = param_exomiser.get("release", None)
4623        exomiser_application_properties = param_exomiser.get(
4624            "exomiser_application_properties", None
4625        )
4626        databases_download_exomiser(
4627            assemblies=[assembly],
4628            exomiser_folder=databases_folders,
4629            exomiser_release=exomiser_release,
4630            exomiser_phenotype_release=exomiser_release,
4631            exomiser_application_properties=exomiser_application_properties,
4632        )
4633
4634        # Force annotation
4635        force_update_annotation = True
4636
4637        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4638            log.debug("Start annotation Exomiser")
4639
4640            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4641
4642                # tmp_dir = "/tmp/exomiser"
4643
4644                ### ANALYSIS ###
4645                ################
4646
4647                # Create analysis.json through analysis dict
4648                # either analysis in param or by default
4649                # depending on preset exome/genome)
4650
4651                # Init analysis dict
4652                param_exomiser_analysis_dict = {}
4653
4654                # analysis from param
4655                param_exomiser_analysis = param_exomiser.get("analysis", {})
4656                param_exomiser_analysis = full_path(param_exomiser_analysis)
4657
4658                # If analysis in param -> load anlaysis json
4659                if param_exomiser_analysis:
4660
4661                    # If param analysis is a file and exists
4662                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4663                        param_exomiser_analysis
4664                    ):
4665                        # Load analysis file into analysis dict (either yaml or json)
4666                        with open(param_exomiser_analysis) as json_file:
4667                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4668
4669                    # If param analysis is a dict
4670                    elif isinstance(param_exomiser_analysis, dict):
4671                        # Load analysis dict into analysis dict (either yaml or json)
4672                        param_exomiser_analysis_dict = param_exomiser_analysis
4673
4674                    # Error analysis type
4675                    else:
4676                        log.error(f"Analysis type unknown. Check param file.")
4677                        raise ValueError(f"Analysis type unknown. Check param file.")
4678
4679                # Case no input analysis config file/dict
4680                # Use preset (exome/genome) to open default config file
4681                if not param_exomiser_analysis_dict:
4682
4683                    # default preset
4684                    default_preset = "exome"
4685
4686                    # Get param preset or default preset
4687                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4688
4689                    # Try to find if preset is a file
4690                    if os.path.exists(param_exomiser_preset):
4691                        # Preset file is provided in full path
4692                        param_exomiser_analysis_default_config_file = (
4693                            param_exomiser_preset
4694                        )
4695                    # elif os.path.exists(full_path(param_exomiser_preset)):
4696                    #     # Preset file is provided in full path
4697                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4698                    elif os.path.exists(
4699                        os.path.join(folder_config, param_exomiser_preset)
4700                    ):
4701                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4702                        param_exomiser_analysis_default_config_file = os.path.join(
4703                            folder_config, param_exomiser_preset
4704                        )
4705                    else:
4706                        # Construct preset file
4707                        param_exomiser_analysis_default_config_file = os.path.join(
4708                            folder_config,
4709                            f"preset-{param_exomiser_preset}-analysis.json",
4710                        )
4711
4712                    # If preset file exists
4713                    param_exomiser_analysis_default_config_file = full_path(
4714                        param_exomiser_analysis_default_config_file
4715                    )
4716                    if os.path.exists(param_exomiser_analysis_default_config_file):
4717                        # Load prest file into analysis dict (either yaml or json)
4718                        with open(
4719                            param_exomiser_analysis_default_config_file
4720                        ) as json_file:
4721                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4722                                json_file
4723                            )
4724
4725                    # Error preset file
4726                    else:
4727                        log.error(
4728                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4729                        )
4730                        raise ValueError(
4731                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4732                        )
4733
4734                # If no analysis dict created
4735                if not param_exomiser_analysis_dict:
4736                    log.error(f"No analysis config")
4737                    raise ValueError(f"No analysis config")
4738
4739                # Log
4740                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4741
4742                ### PHENOPACKET ###
4743                ###################
4744
4745                # If no PhenoPacket in analysis dict -> check in param
4746                if "phenopacket" not in param_exomiser_analysis_dict:
4747
4748                    # If PhenoPacket in param -> load anlaysis json
4749                    if param_exomiser.get("phenopacket", None):
4750
4751                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4752                        param_exomiser_phenopacket = full_path(
4753                            param_exomiser_phenopacket
4754                        )
4755
4756                        # If param phenopacket is a file and exists
4757                        if isinstance(
4758                            param_exomiser_phenopacket, str
4759                        ) and os.path.exists(param_exomiser_phenopacket):
4760                            # Load phenopacket file into analysis dict (either yaml or json)
4761                            with open(param_exomiser_phenopacket) as json_file:
4762                                param_exomiser_analysis_dict["phenopacket"] = (
4763                                    yaml.safe_load(json_file)
4764                                )
4765
4766                        # If param phenopacket is a dict
4767                        elif isinstance(param_exomiser_phenopacket, dict):
4768                            # Load phenopacket dict into analysis dict (either yaml or json)
4769                            param_exomiser_analysis_dict["phenopacket"] = (
4770                                param_exomiser_phenopacket
4771                            )
4772
4773                        # Error phenopacket type
4774                        else:
4775                            log.error(f"Phenopacket type unknown. Check param file.")
4776                            raise ValueError(
4777                                f"Phenopacket type unknown. Check param file."
4778                            )
4779
4780                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4781                if "phenopacket" not in param_exomiser_analysis_dict:
4782
4783                    # Init PhenoPacket
4784                    param_exomiser_analysis_dict["phenopacket"] = {
4785                        "id": "analysis",
4786                        "proband": {},
4787                    }
4788
4789                    ### Add subject ###
4790
4791                    # If subject exists
4792                    param_exomiser_subject = param_exomiser.get("subject", {})
4793
4794                    # If subject not exists -> found sample ID
4795                    if not param_exomiser_subject:
4796
4797                        # Found sample ID in param
4798                        sample = param_exomiser.get("sample", None)
4799
4800                        # Find sample ID (first sample)
4801                        if not sample:
4802                            sample_list = self.get_header_sample_list()
4803                            if len(sample_list) > 0:
4804                                sample = sample_list[0]
4805                            else:
4806                                log.error(f"No sample found")
4807                                raise ValueError(f"No sample found")
4808
4809                        # Create subject
4810                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4811
4812                    # Add to dict
4813                    param_exomiser_analysis_dict["phenopacket"][
4814                        "subject"
4815                    ] = param_exomiser_subject
4816
4817                    ### Add "phenotypicFeatures" ###
4818
4819                    # If phenotypicFeatures exists
4820                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4821                        "phenotypicFeatures", []
4822                    )
4823
4824                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4825                    if not param_exomiser_phenotypicfeatures:
4826
4827                        # Found HPO in param
4828                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4829
4830                        # Split HPO if list in string format separated by comma
4831                        if isinstance(param_exomiser_hpo, str):
4832                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4833
4834                        # Create HPO list
4835                        for hpo in param_exomiser_hpo:
4836                            hpo_clean = re.sub("[^0-9]", "", hpo)
4837                            param_exomiser_phenotypicfeatures.append(
4838                                {
4839                                    "type": {
4840                                        "id": f"HP:{hpo_clean}",
4841                                        "label": f"HP:{hpo_clean}",
4842                                    }
4843                                }
4844                            )
4845
4846                    # Add to dict
4847                    param_exomiser_analysis_dict["phenopacket"][
4848                        "phenotypicFeatures"
4849                    ] = param_exomiser_phenotypicfeatures
4850
4851                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4852                    if not param_exomiser_phenotypicfeatures:
4853                        for step in param_exomiser_analysis_dict.get(
4854                            "analysis", {}
4855                        ).get("steps", []):
4856                            if "hiPhivePrioritiser" in step:
4857                                param_exomiser_analysis_dict.get("analysis", {}).get(
4858                                    "steps", []
4859                                ).remove(step)
4860
4861                ### Add Input File ###
4862
4863                # Initial file name and htsFiles
4864                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4865                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4866                    {
4867                        "uri": tmp_vcf_name,
4868                        "htsFormat": "VCF",
4869                        "genomeAssembly": assembly,
4870                    }
4871                ]
4872
4873                ### Add metaData ###
4874
4875                # If metaData not in analysis dict
4876                if "metaData" not in param_exomiser_analysis_dict:
4877                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4878                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4879                        "createdBy": "howard",
4880                        "phenopacketSchemaVersion": 1,
4881                    }
4882
4883                ### OutputOptions ###
4884
4885                # Init output result folder
4886                output_results = os.path.join(tmp_dir, "results")
4887
4888                # If no outputOptions in analysis dict
4889                if "outputOptions" not in param_exomiser_analysis_dict:
4890
4891                    # default output formats
4892                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4893
4894                    # Get outputOptions in param
4895                    output_options = param_exomiser.get("outputOptions", None)
4896
4897                    # If no output_options in param -> check
4898                    if not output_options:
4899                        output_options = {
4900                            "outputContributingVariantsOnly": False,
4901                            "numGenes": 0,
4902                            "outputFormats": defaut_output_formats,
4903                        }
4904
4905                    # Replace outputDirectory in output options
4906                    output_options["outputDirectory"] = output_results
4907                    output_options["outputFileName"] = "howard"
4908
4909                    # Add outputOptions in analysis dict
4910                    param_exomiser_analysis_dict["outputOptions"] = output_options
4911
4912                else:
4913
4914                    # Replace output_results and output format (if exists in param)
4915                    param_exomiser_analysis_dict["outputOptions"][
4916                        "outputDirectory"
4917                    ] = output_results
4918                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4919                        list(
4920                            set(
4921                                param_exomiser_analysis_dict.get(
4922                                    "outputOptions", {}
4923                                ).get("outputFormats", [])
4924                                + ["TSV_VARIANT", "VCF"]
4925                            )
4926                        )
4927                    )
4928
4929                # log
4930                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4931
4932                ### ANALYSIS FILE ###
4933                #####################
4934
4935                ### Full JSON analysis config file ###
4936
4937                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4938                with open(exomiser_analysis, "w") as fp:
4939                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4940
4941                ### SPLIT analysis and sample config files
4942
4943                # Splitted analysis dict
4944                param_exomiser_analysis_dict_for_split = (
4945                    param_exomiser_analysis_dict.copy()
4946                )
4947
4948                # Phenopacket JSON file
4949                exomiser_analysis_phenopacket = os.path.join(
4950                    tmp_dir, "analysis_phenopacket.json"
4951                )
4952                with open(exomiser_analysis_phenopacket, "w") as fp:
4953                    json.dump(
4954                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4955                        fp,
4956                        indent=4,
4957                    )
4958
4959                # Analysis JSON file without Phenopacket parameters
4960                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4961                exomiser_analysis_analysis = os.path.join(
4962                    tmp_dir, "analysis_analysis.json"
4963                )
4964                with open(exomiser_analysis_analysis, "w") as fp:
4965                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4966
4967                ### INITAL VCF file ###
4968                #######################
4969
4970                ### Create list of samples to use and include inti initial VCF file ####
4971
4972                # Subject (main sample)
4973                # Get sample ID in analysis dict
4974                sample_subject = (
4975                    param_exomiser_analysis_dict.get("phenopacket", {})
4976                    .get("subject", {})
4977                    .get("id", None)
4978                )
4979                sample_proband = (
4980                    param_exomiser_analysis_dict.get("phenopacket", {})
4981                    .get("proband", {})
4982                    .get("subject", {})
4983                    .get("id", None)
4984                )
4985                sample = []
4986                if sample_subject:
4987                    sample.append(sample_subject)
4988                if sample_proband:
4989                    sample.append(sample_proband)
4990
4991                # Get sample ID within Pedigree
4992                pedigree_persons_list = (
4993                    param_exomiser_analysis_dict.get("phenopacket", {})
4994                    .get("pedigree", {})
4995                    .get("persons", {})
4996                )
4997
4998                # Create list with all sample ID in pedigree (if exists)
4999                pedigree_persons = []
5000                for person in pedigree_persons_list:
5001                    pedigree_persons.append(person.get("individualId"))
5002
5003                # Concat subject sample ID and samples ID in pedigreesamples
5004                samples = list(set(sample + pedigree_persons))
5005
5006                # Check if sample list is not empty
5007                if not samples:
5008                    log.error(f"No samples found")
5009                    raise ValueError(f"No samples found")
5010
5011                # Create VCF with sample (either sample in param or first one by default)
5012                # Export VCF file
5013                self.export_variant_vcf(
5014                    vcf_file=tmp_vcf_name,
5015                    remove_info=True,
5016                    add_samples=True,
5017                    list_samples=samples,
5018                    index=False,
5019                )
5020
5021                ### Execute Exomiser ###
5022                ########################
5023
5024                # Init command
5025                exomiser_command = ""
5026
5027                # Command exomiser options
5028                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
5029
5030                # Release
5031                exomiser_release = param_exomiser.get("release", None)
5032                if exomiser_release:
5033                    # phenotype data version
5034                    exomiser_options += (
5035                        f" --exomiser.phenotype.data-version={exomiser_release} "
5036                    )
5037                    # data version
5038                    exomiser_options += (
5039                        f" --exomiser.{assembly}.data-version={exomiser_release} "
5040                    )
5041                    # variant white list
5042                    variant_white_list_file = (
5043                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
5044                    )
5045                    if os.path.exists(
5046                        os.path.join(
5047                            databases_folders, assembly, variant_white_list_file
5048                        )
5049                    ):
5050                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
5051
5052                # transcript_source
5053                transcript_source = param_exomiser.get(
5054                    "transcript_source", None
5055                )  # ucsc, refseq, ensembl
5056                if transcript_source:
5057                    exomiser_options += (
5058                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
5059                    )
5060
5061                # If analysis contain proband param
5062                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
5063                    "proband", {}
5064                ):
5065                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
5066
5067                # If no proband (usually uniq sample)
5068                else:
5069                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
5070
5071                # Log
5072                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
5073
5074                # Run command
5075                result = subprocess.call(
5076                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
5077                )
5078                if result:
5079                    log.error("Exomiser command failed")
5080                    raise ValueError("Exomiser command failed")
5081
5082                ### RESULTS ###
5083                ###############
5084
5085                ### Annotate with TSV fields ###
5086
5087                # Init result tsv file
5088                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
5089
5090                # Init result tsv file
5091                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
5092
5093                # Parse TSV file and explode columns in INFO field
5094                if exomiser_to_info and os.path.exists(output_results_tsv):
5095
5096                    # Log
5097                    log.debug("Exomiser columns to VCF INFO field")
5098
5099                    # Retrieve columns and types
5100                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
5101                    output_results_tsv_df = self.get_query_to_df(query)
5102                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
5103
5104                    # Init concat fields for update
5105                    sql_query_update_concat_fields = []
5106
5107                    # Fields to avoid
5108                    fields_to_avoid = [
5109                        "CONTIG",
5110                        "START",
5111                        "END",
5112                        "REF",
5113                        "ALT",
5114                        "QUAL",
5115                        "FILTER",
5116                        "GENOTYPE",
5117                    ]
5118
5119                    # List all columns to add into header
5120                    for header_column in output_results_tsv_columns:
5121
5122                        # If header column is enable
5123                        if header_column not in fields_to_avoid:
5124
5125                            # Header info type
5126                            header_info_type = "String"
5127                            header_column_df = output_results_tsv_df[header_column]
5128                            header_column_df_dtype = header_column_df.dtype
5129                            if header_column_df_dtype == object:
5130                                if (
5131                                    pd.to_numeric(header_column_df, errors="coerce")
5132                                    .notnull()
5133                                    .all()
5134                                ):
5135                                    header_info_type = "Float"
5136                            else:
5137                                header_info_type = "Integer"
5138
5139                            # Header info
5140                            characters_to_validate = ["-"]
5141                            pattern = "[" + "".join(characters_to_validate) + "]"
5142                            header_info_name = re.sub(
5143                                pattern,
5144                                "_",
5145                                f"Exomiser_{header_column}".replace("#", ""),
5146                            )
5147                            header_info_number = "."
5148                            header_info_description = (
5149                                f"Exomiser {header_column} annotation"
5150                            )
5151                            header_info_source = "Exomiser"
5152                            header_info_version = "unknown"
5153                            header_info_code = CODE_TYPE_MAP[header_info_type]
5154                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
5155                                header_info_name,
5156                                header_info_number,
5157                                header_info_type,
5158                                header_info_description,
5159                                header_info_source,
5160                                header_info_version,
5161                                header_info_code,
5162                            )
5163
5164                            # Add field to add for update to concat fields
5165                            sql_query_update_concat_fields.append(
5166                                f"""
5167                                CASE
5168                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
5169                                    THEN concat(
5170                                        '{header_info_name}=',
5171                                        table_parquet."{header_column}",
5172                                        ';'
5173                                        )
5174
5175                                    ELSE ''
5176                                END
5177                            """
5178                            )
5179
5180                    # Update query
5181                    sql_query_update = f"""
5182                        UPDATE {table_variants} as table_variants
5183                            SET INFO = concat(
5184                                            CASE
5185                                                WHEN INFO NOT IN ('', '.')
5186                                                THEN INFO
5187                                                ELSE ''
5188                                            END,
5189                                            CASE
5190                                                WHEN table_variants.INFO NOT IN ('','.')
5191                                                THEN ';'
5192                                                ELSE ''
5193                                            END,
5194                                            (
5195                                            SELECT 
5196                                                concat(
5197                                                    {",".join(sql_query_update_concat_fields)}
5198                                                )
5199                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
5200                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
5201                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
5202                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5203                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5204                                            )
5205                                        )
5206                            ;
5207                        """
5208
5209                    # Update
5210                    self.conn.execute(sql_query_update)
5211
5212                ### Annotate with VCF INFO field ###
5213
5214                # Init result VCF file
5215                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
5216
5217                # If VCF exists
5218                if os.path.exists(output_results_vcf):
5219
5220                    # Log
5221                    log.debug("Exomiser result VCF update variants")
5222
5223                    # Find Exomiser INFO field annotation in header
5224                    with gzip.open(output_results_vcf, "rt") as f:
5225                        header_list = self.read_vcf_header(f)
5226                    exomiser_vcf_header = vcf.Reader(
5227                        io.StringIO("\n".join(header_list))
5228                    )
5229
5230                    # Add annotation INFO field to header
5231                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
5232
5233                    # Update variants with VCF
5234                    self.update_from_vcf(output_results_vcf)
5235
5236        return True

This function annotates with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionnary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) Default : None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotipic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomise database release. If not exists, database release will be downloaded (take a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
5238    def annotation_snpeff(self, threads: int = None) -> None:
5239        """
5240        This function annotate with snpEff
5241
5242        :param threads: The number of threads to use
5243        :return: the value of the variable "return_value".
5244        """
5245
5246        # DEBUG
5247        log.debug("Start annotation with snpeff databases")
5248
5249        # Threads
5250        if not threads:
5251            threads = self.get_threads()
5252        log.debug("Threads: " + str(threads))
5253
5254        # DEBUG
5255        delete_tmp = True
5256        if self.get_config().get("verbosity", "warning") in ["debug"]:
5257            delete_tmp = False
5258            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5259
5260        # Config
5261        config = self.get_config()
5262        log.debug("Config: " + str(config))
5263
5264        # Config - Folders - Databases
5265        databases_folders = (
5266            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
5267        )
5268        log.debug("Databases annotations: " + str(databases_folders))
5269
5270        # Config - snpEff bin command
5271        snpeff_bin_command = get_bin_command(
5272            bin="snpEff.jar",
5273            tool="snpeff",
5274            bin_type="jar",
5275            config=config,
5276            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
5277        )
5278        if not snpeff_bin_command:
5279            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
5280            log.error(msg_err)
5281            raise ValueError(msg_err)
5282
5283        # Config - snpEff databases
5284        snpeff_databases = (
5285            config.get("folders", {})
5286            .get("databases", {})
5287            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
5288        )
5289        snpeff_databases = full_path(snpeff_databases)
5290        if snpeff_databases is not None and snpeff_databases != "":
5291            log.debug(f"Create snpEff databases folder")
5292            if not os.path.exists(snpeff_databases):
5293                os.makedirs(snpeff_databases)
5294
5295        # Param
5296        param = self.get_param()
5297        log.debug("Param: " + str(param))
5298
5299        # Param
5300        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
5301        log.debug("Options: " + str(options))
5302
5303        # Param - Assembly
5304        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5305
5306        # Param - Options
5307        snpeff_options = (
5308            param.get("annotation", {}).get("snpeff", {}).get("options", "")
5309        )
5310        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
5311        snpeff_csvstats = (
5312            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
5313        )
5314        if snpeff_stats:
5315            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
5316            snpeff_stats = full_path(snpeff_stats)
5317            snpeff_options += f" -stats {snpeff_stats}"
5318        if snpeff_csvstats:
5319            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
5320            snpeff_csvstats = full_path(snpeff_csvstats)
5321            snpeff_options += f" -csvStats {snpeff_csvstats}"
5322
5323        # Data
5324        table_variants = self.get_table_variants()
5325
5326        # Check if not empty
5327        log.debug("Check if not empty")
5328        sql_query_chromosomes = (
5329            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5330        )
5331        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
5332        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5333            log.info(f"VCF empty")
5334            return
5335
5336        # Export in VCF
5337        log.debug("Create initial file to annotate")
5338        tmp_vcf = NamedTemporaryFile(
5339            prefix=self.get_prefix(),
5340            dir=self.get_tmp_dir(),
5341            suffix=".vcf.gz",
5342            delete=True,
5343        )
5344        tmp_vcf_name = tmp_vcf.name
5345
5346        # VCF header
5347        vcf_reader = self.get_header()
5348        log.debug("Initial header: " + str(vcf_reader.infos))
5349
5350        # Existing annotations
5351        for vcf_annotation in self.get_header().infos:
5352
5353            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5354            log.debug(
5355                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5356            )
5357
5358        # Memory limit
5359        # if config.get("memory", None):
5360        #     memory_limit = config.get("memory", "8G")
5361        # else:
5362        #     memory_limit = "8G"
5363        memory_limit = self.get_memory("8G")
5364        log.debug(f"memory_limit: {memory_limit}")
5365
5366        # snpEff java options
5367        snpeff_java_options = (
5368            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
5369        )
5370        log.debug(f"Exomiser java options: {snpeff_java_options}")
5371
5372        force_update_annotation = True
5373
5374        if "ANN" not in self.get_header().infos or force_update_annotation:
5375
5376            # Check snpEff database
5377            log.debug(f"Check snpEff databases {[assembly]}")
5378            databases_download_snpeff(
5379                folder=snpeff_databases, assemblies=[assembly], config=config
5380            )
5381
5382            # Export VCF file
5383            self.export_variant_vcf(
5384                vcf_file=tmp_vcf_name,
5385                remove_info=True,
5386                add_samples=False,
5387                index=True,
5388            )
5389
5390            # Tmp file
5391            err_files = []
5392            tmp_annotate_vcf = NamedTemporaryFile(
5393                prefix=self.get_prefix(),
5394                dir=self.get_tmp_dir(),
5395                suffix=".vcf",
5396                delete=False,
5397            )
5398            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5399            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5400            err_files.append(tmp_annotate_vcf_name_err)
5401
5402            # Command
5403            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5404            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5405            run_parallel_commands([snpeff_command], 1)
5406
5407            # Error messages
5408            log.info(f"Error/Warning messages:")
5409            error_message_command_all = []
5410            error_message_command_warning = []
5411            error_message_command_err = []
5412            for err_file in err_files:
5413                with open(err_file, "r") as f:
5414                    for line in f:
5415                        message = line.strip()
5416                        error_message_command_all.append(message)
5417                        if line.startswith("[W::"):
5418                            error_message_command_warning.append(message)
5419                        if line.startswith("[E::"):
5420                            error_message_command_err.append(f"{err_file}: " + message)
5421            # log info
5422            for message in list(
5423                set(error_message_command_err + error_message_command_warning)
5424            ):
5425                log.info(f"   {message}")
5426            # debug info
5427            for message in list(set(error_message_command_all)):
5428                log.debug(f"   {message}")
5429            # failed
5430            if len(error_message_command_err):
5431                log.error("Annotation failed: Error in commands")
5432                raise ValueError("Annotation failed: Error in commands")
5433
5434            # Find annotation in header
5435            with open(tmp_annotate_vcf_name, "rt") as f:
5436                header_list = self.read_vcf_header(f)
5437            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5438
5439            for ann in annovar_vcf_header.infos:
5440                if ann not in self.get_header().infos:
5441                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5442
5443            # Update variants
5444            log.info(f"Annotation - Updating...")
5445            self.update_from_vcf(tmp_annotate_vcf_name)
5446
5447        else:
5448            if "ANN" in self.get_header().infos:
5449                log.debug(f"Existing snpEff annotations in VCF")
5450            if force_update_annotation:
5451                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations

        Workflow: for each configured Annovar database, run table_annovar.pl on an
        exported VCF, clean the Annovar output through a bcftools/sed/awk pipeline,
        then bcftools-merge all per-database annotated VCFs and update the variants
        table from the merged result.

        :param threads: number of threads to use
        :return: the value of the variable "return_value".
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (collected for final cleanup)
        tmp_files = []
        err_files = []

        # Keep tmp files/folders in debug mode for troubleshooting
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper built by get_bin_command)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for view/annotate/merge steps)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder; a list config falls back to its
        # first entry, and the folder is created if missing
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of annovar database -> fields to keep/rename
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Annotation is currently always (re-)applied
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF (common input for every Annovar database run)
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            # NOTE(review): remove_info="." here vs remove_info=True in
            # annotation_snpeff — confirm the intended difference.
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (download missing databases, may take a while)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): this re-binds err_files, discarding entries from
                # previous loop iterations (and the outer err_files list) — so
                # only the last iteration's .err is scanned after the loop; confirm.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info ("INFO/<old> <new>" line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value "."): the awk pass
                # rebuilds each INFO column keeping only key=value pairs whose
                # value is not "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files for warnings/errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: original exported VCF + all per-database annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and add the new
                # INFO definitions to the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations

Parameters
  • threads: number of threads to use
Returns

the value of the variable "return_value".

def annotation_parquet(self, threads: int = None) -> None:
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or several Parquet (or other
        duckdb-attachable) annotation databases.

        For each database listed in param["annotation"]["parquet"]["annotations"],
        the requested INFO fields are appended to the INFO column of the variants
        table through per-chromosome SQL UPDATE queries, and the VCF header is
        extended with the corresponding INFO definitions.

        Options (from param["annotation"]["options"]):
        - "annotations_update": existing INFO fields are first removed, then
          re-annotated (forced update).
        - "annotations_append": only variants whose existing value for the field
          is empty or '.' are annotated.

        :param threads: number of threads to use for the annotation
            (defaults to self.get_threads())
        :return: None — variants are annotated in place
        :raises ValueError: if an annotation database file or its header
            (.hdr) file cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed for logging only; it is not used
        # further in this method.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Search folders: union of the generic "annotations" folders and the
        # parquet-specific folders (both default to the current directory).
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Mapping of database path/name -> requested fields (dict), or None.
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # "annotations_update": strip any existing value of the field from INFO
        # before re-annotating (see query_dict_remove below).
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # "annotations_append": annotate only variants whose current value for
        # the field is empty or '.' (see query_case_when_append below).
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty variants table.
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header — mutated in place as new INFO fields are added.
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS — total variant count, used for the final summary log.
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): never populated in this method; the cleanup loop at the
        # end is currently a no-op.
        added_columns = []

        # drop indexes — UPDATEs below are faster without indexes to maintain.
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: scan available databases (optionally
            # filtered by formats/releases) and add each one with all fields.
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a directive, not an actual database — skip it here.
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields — default to the whole INFO column.
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database — resolves the file within databases_folders.
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exist — both the data file and its header are
                # required; otherwise the annotation cannot be described.
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion — ATTACH the database if needed so
                    # it can be referenced in SQL via parquet_file_link.
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos — database columns beyond the VCF core set.
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    # Extra columns not declared in the header get a generic
                    # String INFO definition so they can still be annotated.
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database — "ALL"/"INFO" expands to every
                    # field declared in the database header.
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use (SQL CASE fragments that
                    # build the text appended to INFO).
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to aggregate (used for "regions"
                    # databases, where several regions may overlap one POS).
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping — field name -> database column ("INFO/"
                    # prefix handled transparently).
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch annotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered (rename on annotation)
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        # Annotate when the field exists in the database header
                        # and either update/append is forced, or the field is
                        # not yet present in the variants header.
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO (regex strips
                                # ';FIELD=value' occurrences) so the forced
                                # update does not produce duplicates.
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO — ';' from the second
                            # field onwards.
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header, falling back to generic
                            # defaults for any missing header attribute.
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append — extra predicate restricting annotation to
                            # variants whose current value is empty or '.'.
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column: extract the value from the
                            # database's INFO string with a regex.
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                """
                                )
                            # Found in a specific column: cast to VARCHAR and
                            # replace ';' with ',' to keep INFO parseable.
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                    END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate — log why the field was skipped.
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    # Shortcut: when every field of the database is requested and
                    # the database has a full INFO column, copy that column as-is
                    # instead of rebuilding it field by field. Not compatible
                    # with append mode or region databases.
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos) — one
                        # UPDATE query is generated per chromosome.
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Start from the removal queries (update option) so they
                        # run before the annotation queries.
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join on interval
                            # overlap between the variant (POS..POS+len(REF)-1)
                            # and the region (START+1..END, BED-style START is
                            # 0-based — TODO confirm), aggregating overlapping
                            # regions per POS with string_agg.
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact match on
                            # CHROM/POS/REF/ALT.
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append the new annotation text
                            # to INFO, inserting a ';' separator only when INFO is
                            # non-empty and the new text is non-empty.
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO = 
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                        AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.') 
                                                    THEN ';'
                                                    ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x — the generated concat of
                        # many CASE fragments can exceed duckdb's default
                        # expression-depth limit.
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # duckdb reports the number of rows changed by an
                            # UPDATE in a 'Count' column — TODO confirm this
                            # holds across duckdb versions.
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

                    log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns (no-op while added_columns stays empty).
        for added_column in added_columns:
            self.drop_column(column=added_column)

Takes a VCF file and annotates it with a Parquet annotation database.

Parameters
  • threads: number of threads to use for the annotation

Returns
  None — variants are annotated in place (the internal variable "result" is not returned).

def annotation_splice(self, threads: int = None) -> None:
6428    def annotation_splice(self, threads: int = None) -> None:
6429        """
6430        This function annotate with snpEff
6431
6432        :param threads: The number of threads to use
6433        :return: the value of the variable "return_value".
6434        """
6435
6436        # DEBUG
6437        log.debug("Start annotation with splice tools")
6438
6439        # Threads
6440        if not threads:
6441            threads = self.get_threads()
6442        log.debug("Threads: " + str(threads))
6443
6444        # DEBUG
6445        delete_tmp = True
6446        if self.get_config().get("verbosity", "warning") in ["debug"]:
6447            delete_tmp = False
6448            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6449
6450        # Config
6451        config = self.get_config()
6452        log.debug("Config: " + str(config))
6453        splice_config = config.get("tools", {}).get("splice", {})
6454        if not splice_config:
6455            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6456            msg_err = "No Splice tool config"
6457            raise ValueError(msg_err)
6458        log.debug(f"splice_config: {splice_config}")
6459
6460        # Config - Folders - Databases
6461        databases_folders = (
6462            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6463        )
6464        log.debug("Databases annotations: " + str(databases_folders))
6465
6466        # Splice docker image
6467        splice_docker_image = splice_config.get("docker").get("image")
6468
6469        # Pull splice image if it's not already there
6470        if not check_docker_image_exists(splice_docker_image):
6471            log.warning(
6472                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6473            )
6474            try:
6475                command(f"docker pull {splice_config.get('docker').get('image')}")
6476            except subprocess.CalledProcessError:
6477                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6478                log.error(msg_err)
6479                raise ValueError(msg_err)
6480
6481        # Config - splice databases
6482        splice_databases = (
6483            config.get("folders", {})
6484            .get("databases", {})
6485            .get("splice", DEFAULT_SPLICE_FOLDER)
6486        )
6487        splice_databases = full_path(splice_databases)
6488
6489        # Param
6490        param = self.get_param()
6491        log.debug("Param: " + str(param))
6492
6493        # Param
6494        options = param.get("annotation", {}).get("splice", {}).get("options", {})
6495        log.debug("Options: " + str(options))
6496
6497        # Data
6498        table_variants = self.get_table_variants()
6499
6500        # Check if not empty
6501        log.debug("Check if not empty")
6502        sql_query_chromosomes = (
6503            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6504        )
6505        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6506            log.info("VCF empty")
6507            return None
6508
6509        # Export in VCF
6510        log.debug("Create initial file to annotate")
6511
6512        # Create output folder / work folder
6513        if options.get("output_folder", ""):
6514            output_folder = options.get("output_folder", "")
6515            if not os.path.exists(output_folder):
6516                Path(output_folder).mkdir(parents=True, exist_ok=True)
6517        else:
6518            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6519            if not os.path.exists(output_folder):
6520                Path(output_folder).mkdir(parents=True, exist_ok=True)
6521
6522        if options.get("workdir", ""):
6523            workdir = options.get("workdir", "")
6524        else:
6525            workdir = "/work"
6526
6527        # Create tmp VCF file
6528        tmp_vcf = NamedTemporaryFile(
6529            prefix=self.get_prefix(),
6530            dir=output_folder,
6531            suffix=".vcf",
6532            delete=False,
6533        )
6534        tmp_vcf_name = tmp_vcf.name
6535
6536        # VCF header
6537        header = self.get_header()
6538
6539        # Existing annotations
6540        for vcf_annotation in self.get_header().infos:
6541
6542            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6543            log.debug(
6544                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6545            )
6546
6547        # Memory limit
6548        if config.get("memory", None):
6549            memory_limit = config.get("memory", "8G").upper()
6550            # upper()
6551        else:
6552            memory_limit = "8G"
6553        log.debug(f"memory_limit: {memory_limit}")
6554
6555        # Check number of variants to annotate
6556        where_clause_regex_spliceai = r"SpliceAI_\w+"
6557        where_clause_regex_spip = r"SPiP_\w+"
6558        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6559        df_list_of_variants_to_annotate = self.get_query_to_df(
6560            query=f""" SELECT * FROM variants {where_clause} """
6561        )
6562        if len(df_list_of_variants_to_annotate) == 0:
6563            log.warning(
6564                f"No variants to annotate with splice. Variants probably already annotated with splice"
6565            )
6566            return None
6567        else:
6568            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6569
6570        # Export VCF file
6571        self.export_variant_vcf(
6572            vcf_file=tmp_vcf_name,
6573            remove_info=True,
6574            add_samples=True,
6575            index=False,
6576            where_clause=where_clause,
6577        )
6578        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
6579        if any(value for value in splice_config.values() if value is None):
6580            log.warning("At least one splice config parameter is empty")
6581            # exit annotation_splice
6582            return None
6583
6584        # Params in splice nf
6585        def check_values(dico: dict):
6586            """
6587            Ensure parameters for NF splice pipeline
6588            """
6589            for key, val in dico.items():
6590                if key == "genome":
6591                    if any(
6592                        assemb in options.get("genome", {})
6593                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6594                    ):
6595                        yield f"--{key} hg19"
6596                    elif any(
6597                        assemb in options.get("genome", {})
6598                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6599                    ):
6600                        yield f"--{key} hg38"
6601                elif (
6602                    (isinstance(val, str) and val)
6603                    or isinstance(val, int)
6604                    or isinstance(val, bool)
6605                ):
6606                    yield f"--{key} {val}"
6607
6608        # Genome
6609        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6610        options["genome"] = genome
6611        # NF params
6612        nf_params = []
6613        # Add options
6614        if options:
6615            log.debug(options)
6616            nf_params = list(check_values(options))
6617            log.debug(f"Splice NF params: {' '.join(nf_params)}")
6618        else:
6619            log.debug("No NF params provided")
6620        # Add threads
6621        if "threads" not in options.keys():
6622            nf_params.append(f"--threads {threads}")
6623        # Genome path
6624        genome_path = find_genome(
6625            config.get("folders", {})
6626            .get("databases", {})
6627            .get("genomes", DEFAULT_GENOME_FOLDER),
6628            file=f"{genome}.fa",
6629        )
6630        # Add genome path
6631        if not genome_path:
6632            raise ValueError(
6633                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6634            )
6635        else:
6636            log.debug(f"Genome: {genome_path}")
6637            nf_params.append(f"--genome_path {genome_path}")
6638
6639        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6640            """
6641            Setting up updated databases for SPiP and SpliceAI
6642            """
6643
6644            try:
6645
6646                # SpliceAI assembly transcriptome
6647                spliceai_assembly = os.path.join(
6648                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
6649                    options.get("genome"),
6650                    "transcriptome",
6651                )
6652                spip_assembly = options.get("genome")
6653
6654                spip = find(
6655                    f"transcriptome_{spip_assembly}.RData",
6656                    config.get("folders", {}).get("databases", {}).get("spip", {}),
6657                )
6658                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6659                log.debug(f"SPiP annotations: {spip}")
6660                log.debug(f"SpliceAI annotations: {spliceai}")
6661                if spip and spliceai:
6662                    return [
6663                        f"--spip_transcriptome {spip}",
6664                        f"--spliceai_transcriptome {spliceai}",
6665                    ]
6666                else:
6667                    log.warning(
6668                        "Can't find splice databases in configuration, use annotations file from image"
6669                    )
6670            except TypeError:
6671                log.warning(
6672                    "Can't find splice databases in configuration, use annotations file from image"
6673                )
6674                return []
6675
6676        # Add options, check if transcriptome option have already beend provided
6677        if (
6678            "spip_transcriptome" not in nf_params
6679            and "spliceai_transcriptome" not in nf_params
6680        ):
6681            splice_reference = splice_annotations(options, config)
6682            if splice_reference:
6683                nf_params.extend(splice_reference)
6684        # nf_params.append(f"--output_folder {output_folder}")
6685        random_uuid = f"HOWARD-SPLICE-{get_random()}"
6686        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6687        log.debug(cmd)
6688        splice_config["docker"]["command"] = cmd
6689
6690        # Ensure proxy is set
6691        proxy = [
6692            f"-e {var}={os.getenv(var)}"
6693            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
6694            if os.getenv(var) is not None
6695        ]
6696        docker_cmd = get_bin_command(
6697            tool="splice",
6698            bin_type="docker",
6699            config=config,
6700            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6701            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
6702        )
6703        # print(docker_cmd)
6704        # exit()
6705        # Docker debug
6706        # if splice_config.get("rm_container"):
6707        #     rm_container = "--rm"
6708        # else:
6709        #     rm_container = ""
6710        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6711        log.debug(docker_cmd)
6712        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6713        log.debug(res.stdout)
6714        if res.stderr:
6715            log.error(res.stderr)
6716        res.check_returncode()
6717        # Update variants
6718        log.info("Annotation - Updating...")
6719        # Test find output vcf
6720        log.debug(
6721            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6722        )
6723        output_vcf = []
6724        # Wrong folder to look in
6725        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6726            if (
6727                files
6728                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6729            ):
6730                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6731        # log.debug(os.listdir(options.get("output_folder")))
6732        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6733        if not output_vcf:
6734            log.debug(
6735                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6736            )
6737        else:
6738            # Get new header from annotated vcf
6739            log.debug(f"Initial header: {len(header.infos)} fields")
6740            # Create new header with splice infos
6741            new_vcf = Variants(input=output_vcf[0])
6742            new_vcf_header = new_vcf.get_header().infos
6743            for keys, infos in new_vcf_header.items():
6744                if keys not in header.infos.keys():
6745                    header.infos[keys] = infos
6746            log.debug(f"New header: {len(header.infos)} fields")
6747            log.debug(f"Splice tmp output: {output_vcf[0]}")
6748            self.update_from_vcf(output_vcf[0])
6749
6750        # Remove file
6751        remove_if_exists(output_vcf)

This function annotates variants with splice prediction tools (SPiP and SpliceAI)

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def get_config_default(self, name: str) -> dict:
6757    def get_config_default(self, name: str) -> dict:
6758        """
6759        The function `get_config_default` returns a dictionary containing default configurations for
6760        various calculations and prioritizations.
6761
6762        :param name: The `get_config_default` function returns a dictionary containing default
6763        configurations for different calculations and prioritizations. The `name` parameter is used to
6764        specify which specific configuration to retrieve from the dictionary
6765        :type name: str
6766        :return: The function `get_config_default` returns a dictionary containing default configuration
6767        settings for different calculations and prioritizations. The specific configuration settings are
6768        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6769        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6770        returned. If there is no match, an empty dictionary is returned.
6771        """
6772
6773        config_default = {
6774            "calculations": {
6775                "variant_chr_pos_alt_ref": {
6776                    "type": "sql",
6777                    "name": "variant_chr_pos_alt_ref",
6778                    "description": "Create a variant ID with chromosome, position, alt and ref",
6779                    "available": False,
6780                    "output_column_name": "variant_chr_pos_alt_ref",
6781                    "output_column_type": "String",
6782                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6783                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6784                    "operation_info": True,
6785                },
6786                "VARTYPE": {
6787                    "type": "sql",
6788                    "name": "VARTYPE",
6789                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6790                    "available": True,
6791                    "table": "variants",
6792                    "output_column_name": "VARTYPE",
6793                    "output_column_type": "String",
6794                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6795                    "operation_query": """
6796                            CASE
6797                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6798                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6799                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6800                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6801                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6802                                ELSE 'UNDEFINED'
6803                            END
6804                            """,
6805                    "info_fields": ["SVTYPE"],
6806                    "operation_info": True,
6807                },
6808                "snpeff_hgvs": {
6809                    "type": "python",
6810                    "name": "snpeff_hgvs",
6811                    "description": "HGVS nomenclatures from snpEff annotation",
6812                    "available": True,
6813                    "function_name": "calculation_extract_snpeff_hgvs",
6814                    "function_params": ["snpeff_hgvs", "ANN"],
6815                },
6816                "snpeff_ann_explode": {
6817                    "type": "python",
6818                    "name": "snpeff_ann_explode",
6819                    "description": "Explode snpEff annotations with uniquify values",
6820                    "available": True,
6821                    "function_name": "calculation_snpeff_ann_explode",
6822                    "function_params": [False, "fields", "snpeff_", "ANN"],
6823                },
6824                "snpeff_ann_explode_uniquify": {
6825                    "type": "python",
6826                    "name": "snpeff_ann_explode_uniquify",
6827                    "description": "Explode snpEff annotations",
6828                    "available": True,
6829                    "function_name": "calculation_snpeff_ann_explode",
6830                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6831                },
6832                "snpeff_ann_explode_json": {
6833                    "type": "python",
6834                    "name": "snpeff_ann_explode_json",
6835                    "description": "Explode snpEff annotations in JSON format",
6836                    "available": True,
6837                    "function_name": "calculation_snpeff_ann_explode",
6838                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6839                },
6840                "NOMEN": {
6841                    "type": "python",
6842                    "name": "NOMEN",
6843                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
6844                    "available": True,
6845                    "function_name": "calculation_extract_nomen",
6846                    "function_params": [],
6847                },
6848                "RENAME_INFO_FIELDS": {
6849                    "type": "python",
6850                    "name": "RENAME_INFO_FIELDS",
6851                    "description": "Rename or remove INFO/tags",
6852                    "available": True,
6853                    "function_name": "calculation_rename_info_fields",
6854                    "function_params": [],
6855                },
6856                "FINDBYPIPELINE": {
6857                    "type": "python",
6858                    "name": "FINDBYPIPELINE",
6859                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6860                    "available": True,
6861                    "function_name": "calculation_find_by_pipeline",
6862                    "function_params": ["findbypipeline"],
6863                },
6864                "FINDBYSAMPLE": {
6865                    "type": "python",
6866                    "name": "FINDBYSAMPLE",
6867                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6868                    "available": True,
6869                    "function_name": "calculation_find_by_pipeline",
6870                    "function_params": ["findbysample"],
6871                },
6872                "GENOTYPECONCORDANCE": {
6873                    "type": "python",
6874                    "name": "GENOTYPECONCORDANCE",
6875                    "description": "Concordance of genotype for multi caller VCF",
6876                    "available": True,
6877                    "function_name": "calculation_genotype_concordance",
6878                    "function_params": [],
6879                },
6880                "BARCODE": {
6881                    "type": "python",
6882                    "name": "BARCODE",
6883                    "description": "BARCODE as VaRank tool",
6884                    "available": True,
6885                    "function_name": "calculation_barcode",
6886                    "function_params": [],
6887                },
6888                "BARCODEFAMILY": {
6889                    "type": "python",
6890                    "name": "BARCODEFAMILY",
6891                    "description": "BARCODEFAMILY as VaRank tool",
6892                    "available": True,
6893                    "function_name": "calculation_barcode_family",
6894                    "function_params": ["BCF"],
6895                },
6896                "TRIO": {
6897                    "type": "python",
6898                    "name": "TRIO",
6899                    "description": "Inheritance for a trio family",
6900                    "available": True,
6901                    "function_name": "calculation_trio",
6902                    "function_params": [],
6903                },
6904                "VAF": {
6905                    "type": "python",
6906                    "name": "VAF",
6907                    "description": "Variant Allele Frequency (VAF) harmonization",
6908                    "available": True,
6909                    "function_name": "calculation_vaf_normalization",
6910                    "function_params": [],
6911                },
6912                "VAF_stats": {
6913                    "type": "python",
6914                    "name": "VAF_stats",
6915                    "description": "Variant Allele Frequency (VAF) statistics",
6916                    "available": True,
6917                    "function_name": "calculation_genotype_stats",
6918                    "function_params": ["VAF"],
6919                },
6920                "DP_stats": {
6921                    "type": "python",
6922                    "name": "DP_stats",
6923                    "description": "Depth (DP) statistics",
6924                    "available": True,
6925                    "function_name": "calculation_genotype_stats",
6926                    "function_params": ["DP"],
6927                },
6928                "variant_id": {
6929                    "type": "python",
6930                    "name": "variant_id",
6931                    "description": "Variant ID generated from variant position and type",
6932                    "available": True,
6933                    "function_name": "calculation_variant_id",
6934                    "function_params": [],
6935                },
6936                "transcripts_json": {
6937                    "type": "python",
6938                    "name": "transcripts_json",
6939                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6940                    "available": True,
6941                    "function_name": "calculation_transcripts_annotation",
6942                    "function_params": ["transcripts_json", None],
6943                },
6944                "transcripts_ann": {
6945                    "type": "python",
6946                    "name": "transcripts_ann",
6947                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6948                    "available": True,
6949                    "function_name": "calculation_transcripts_annotation",
6950                    "function_params": [None, "transcripts_ann"],
6951                },
6952                "transcripts_annotations": {
6953                    "type": "python",
6954                    "name": "transcripts_annotations",
6955                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6956                    "available": True,
6957                    "function_name": "calculation_transcripts_annotation",
6958                    "function_params": [None, None],
6959                },
6960                "transcripts_prioritization": {
6961                    "type": "python",
6962                    "name": "transcripts_prioritization",
6963                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6964                    "available": True,
6965                    "function_name": "calculation_transcripts_prioritization",
6966                    "function_params": [],
6967                },
6968                "transcripts_export": {
6969                    "type": "python",
6970                    "name": "transcripts_export",
6971                    "description": "Export transcripts table/view as a file (using param.json)",
6972                    "available": True,
6973                    "function_name": "calculation_transcripts_export",
6974                    "function_params": [],
6975                },
6976            },
6977            "prioritizations": {
6978                "default": {
6979                    "ANN2": [
6980                        {
6981                            "type": "contains",
6982                            "value": "HIGH",
6983                            "score": 5,
6984                            "flag": "PASS",
6985                            "comment": [
6986                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6987                            ],
6988                        },
6989                        {
6990                            "type": "contains",
6991                            "value": "MODERATE",
6992                            "score": 3,
6993                            "flag": "PASS",
6994                            "comment": [
6995                                "A non-disruptive variant that might change protein effectiveness"
6996                            ],
6997                        },
6998                        {
6999                            "type": "contains",
7000                            "value": "LOW",
7001                            "score": 0,
7002                            "flag": "FILTERED",
7003                            "comment": [
7004                                "Assumed to be mostly harmless or unlikely to change protein behavior"
7005                            ],
7006                        },
7007                        {
7008                            "type": "contains",
7009                            "value": "MODIFIER",
7010                            "score": 0,
7011                            "flag": "FILTERED",
7012                            "comment": [
7013                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
7014                            ],
7015                        },
7016                    ],
7017                }
7018            },
7019        }
7020
7021        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: the name of the configuration section to retrieve (e.g. 'calculations' or 'prioritizations')
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
7023    def get_config_json(
7024        self, name: str, config_dict: dict = {}, config_file: str = None
7025    ) -> dict:
7026        """
7027        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
7028        default values, a dictionary, and a file.
7029
7030        :param name: The `name` parameter in the `get_config_json` function is a string that represents
7031        the name of the configuration. It is used to identify and retrieve the configuration settings
7032        for a specific component or module
7033        :type name: str
7034        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
7035        dictionary that allows you to provide additional configuration settings or overrides. When you
7036        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
7037        the key is the configuration setting you want to override or
7038        :type config_dict: dict
7039        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
7040        specify the path to a configuration file that contains additional settings. If provided, the
7041        function will read the contents of this file and update the configuration dictionary with the
7042        values found in the file, overriding any existing values with the
7043        :type config_file: str
7044        :return: The function `get_config_json` returns a dictionary containing the configuration
7045        settings.
7046        """
7047
7048        # Create with default prioritizations
7049        config_default = self.get_config_default(name=name)
7050        configuration = config_default
7051        # log.debug(f"configuration={configuration}")
7052
7053        # Replace prioritizations from dict
7054        for config in config_dict:
7055            configuration[config] = config_dict[config]
7056
7057        # Replace prioritizations from file
7058        config_file = full_path(config_file)
7059        if config_file:
7060            if os.path.exists(config_file):
7061                with open(config_file) as config_file_content:
7062                    config_file_dict = yaml.safe_load(config_file_content)
7063                for config in config_file_dict:
7064                    configuration[config] = config_file_dict[config]
7065            else:
7066                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
7067                log.error(msg_error)
7068                raise ValueError(msg_error)
7069
7070        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
7072    def prioritization(
7073        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
7074    ) -> bool:
7075        """
7076        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
7077        prioritizes variants based on configured profiles and criteria.
7078
7079        :param table: The `table` parameter in the `prioritization` function is used to specify the name
7080        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
7081        a table name is provided, the method will prioritize the variants in that specific table
7082        :type table: str
7083        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
7084        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
7085        provided, the code will use a default prefix value of "PZ"
7086        :type pz_prefix: str
7087        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
7088        additional parameters specific to the prioritization process. These parameters can include
7089        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
7090        configurations needed for the prioritization of variants in a V
7091        :type pz_param: dict
7092        :return: A boolean value (True) is being returned from the `prioritization` function.
7093        """
7094
7095        # Config
7096        config = self.get_config()
7097
7098        # Param
7099        param = self.get_param()
7100
7101        # Prioritization param
7102        if pz_param is not None:
7103            prioritization_param = pz_param
7104        else:
7105            prioritization_param = param.get("prioritization", {})
7106
7107        # Configuration profiles
7108        prioritization_config_file = prioritization_param.get(
7109            "prioritization_config", None
7110        )
7111        prioritization_config_file = full_path(prioritization_config_file)
7112        prioritizations_config = self.get_config_json(
7113            name="prioritizations", config_file=prioritization_config_file
7114        )
7115
7116        # Prioritization prefix
7117        pz_prefix_default = "PZ"
7118        if pz_prefix is None:
7119            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
7120
7121        # Prioritization options
7122        profiles = prioritization_param.get("profiles", [])
7123        if isinstance(profiles, str):
7124            profiles = profiles.split(",")
7125        pzfields = prioritization_param.get(
7126            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
7127        )
7128        if isinstance(pzfields, str):
7129            pzfields = pzfields.split(",")
7130        default_profile = prioritization_param.get("default_profile", None)
7131        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
7132        prioritization_score_mode = prioritization_param.get(
7133            "prioritization_score_mode", "HOWARD"
7134        )
7135
7136        # Quick Prioritizations
7137        prioritizations = param.get("prioritizations", None)
7138        if prioritizations:
7139            log.info("Quick Prioritization:")
7140            for profile in prioritizations.split(","):
7141                if profile not in profiles:
7142                    profiles.append(profile)
7143                    log.info(f"   {profile}")
7144
7145        # If profile "ALL" provided, all profiles in the config profiles
7146        if "ALL" in profiles:
7147            profiles = list(prioritizations_config.keys())
7148
7149        for profile in profiles:
7150            if prioritizations_config.get(profile, None):
7151                log.debug(f"Profile '{profile}' configured")
7152            else:
7153                msg_error = f"Profile '{profile}' NOT configured"
7154                log.error(msg_error)
7155                raise ValueError(msg_error)
7156
7157        if profiles:
7158            log.info(f"Prioritization... ")
7159        else:
7160            log.debug(f"No profile defined")
7161            return False
7162
7163        if not default_profile and len(profiles):
7164            default_profile = profiles[0]
7165
7166        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
7167        log.debug("Profiles to check: " + str(list(profiles)))
7168
7169        # Variables
7170        if table is not None:
7171            table_variants = table
7172        else:
7173            table_variants = self.get_table_variants(clause="update")
7174        log.debug(f"Table to prioritize: {table_variants}")
7175
7176        # Added columns
7177        added_columns = []
7178
7179        # Create list of PZfields
7180        # List of PZFields
7181        list_of_pzfields_original = pzfields + [
7182            pzfield + pzfields_sep + profile
7183            for pzfield in pzfields
7184            for profile in profiles
7185        ]
7186        list_of_pzfields = []
7187        log.debug(f"{list_of_pzfields_original}")
7188
7189        # Remove existing PZfields to use if exists
7190        for pzfield in list_of_pzfields_original:
7191            if self.get_header().infos.get(pzfield, None) is None:
7192                list_of_pzfields.append(pzfield)
7193                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
7194            else:
7195                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
7196
7197        if list_of_pzfields:
7198
7199            # Explode Infos prefix
7200            explode_infos_prefix = self.get_explode_infos_prefix()
7201
7202            # PZfields tags description
7203            PZfields_INFOS = {
7204                f"{pz_prefix}Tags": {
7205                    "ID": f"{pz_prefix}Tags",
7206                    "Number": ".",
7207                    "Type": "String",
7208                    "Description": "Variant tags based on annotation criteria",
7209                },
7210                f"{pz_prefix}Score": {
7211                    "ID": f"{pz_prefix}Score",
7212                    "Number": 1,
7213                    "Type": "Integer",
7214                    "Description": "Variant score based on annotation criteria",
7215                },
7216                f"{pz_prefix}Flag": {
7217                    "ID": f"{pz_prefix}Flag",
7218                    "Number": 1,
7219                    "Type": "String",
7220                    "Description": "Variant flag based on annotation criteria",
7221                },
7222                f"{pz_prefix}Comment": {
7223                    "ID": f"{pz_prefix}Comment",
7224                    "Number": ".",
7225                    "Type": "String",
7226                    "Description": "Variant comment based on annotation criteria",
7227                },
7228                f"{pz_prefix}Infos": {
7229                    "ID": f"{pz_prefix}Infos",
7230                    "Number": ".",
7231                    "Type": "String",
7232                    "Description": "Variant infos based on annotation criteria",
7233                },
7234                f"{pz_prefix}Class": {
7235                    "ID": f"{pz_prefix}Class",
7236                    "Number": ".",
7237                    "Type": "String",
7238                    "Description": "Variant class based on annotation criteria",
7239                },
7240            }
7241
7242            # Create INFO fields if not exist
7243            for field in PZfields_INFOS:
7244                field_ID = PZfields_INFOS[field]["ID"]
7245                field_description = PZfields_INFOS[field]["Description"]
7246                if field_ID not in self.get_header().infos and field_ID in pzfields:
7247                    field_description = (
7248                        PZfields_INFOS[field]["Description"]
7249                        + f", profile {default_profile}"
7250                    )
7251                    self.get_header().infos[field_ID] = vcf.parser._Info(
7252                        field_ID,
7253                        PZfields_INFOS[field]["Number"],
7254                        PZfields_INFOS[field]["Type"],
7255                        field_description,
7256                        "unknown",
7257                        "unknown",
7258                        code_type_map[PZfields_INFOS[field]["Type"]],
7259                    )
7260
7261            # Create INFO fields if not exist for each profile
7262            for profile in prioritizations_config:
7263                if profile in profiles or profiles == []:
7264                    for field in PZfields_INFOS:
7265                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
7266                        field_description = (
7267                            PZfields_INFOS[field]["Description"]
7268                            + f", profile {profile}"
7269                        )
7270                        if (
7271                            field_ID not in self.get_header().infos
7272                            and field in pzfields
7273                        ):
7274                            self.get_header().infos[field_ID] = vcf.parser._Info(
7275                                field_ID,
7276                                PZfields_INFOS[field]["Number"],
7277                                PZfields_INFOS[field]["Type"],
7278                                field_description,
7279                                "unknown",
7280                                "unknown",
7281                                code_type_map[PZfields_INFOS[field]["Type"]],
7282                            )
7283
7284            # Header
7285            for pzfield in list_of_pzfields:
7286                if re.match(f"{pz_prefix}Score.*", pzfield):
7287                    added_column = self.add_column(
7288                        table_name=table_variants,
7289                        column_name=pzfield,
7290                        column_type="INTEGER",
7291                        default_value="0",
7292                    )
7293                elif re.match(f"{pz_prefix}Flag.*", pzfield):
7294                    added_column = self.add_column(
7295                        table_name=table_variants,
7296                        column_name=pzfield,
7297                        column_type="BOOLEAN",
7298                        default_value="1",
7299                    )
7300                elif re.match(f"{pz_prefix}Class.*", pzfield):
7301                    added_column = self.add_column(
7302                        table_name=table_variants,
7303                        column_name=pzfield,
7304                        column_type="VARCHAR[]",
7305                        default_value="null",
7306                    )
7307                else:
7308                    added_column = self.add_column(
7309                        table_name=table_variants,
7310                        column_name=pzfield,
7311                        column_type="STRING",
7312                        default_value="''",
7313                    )
7314                added_columns.append(added_column)
7315
7316            # Profiles
7317            if profiles:
7318
7319                # foreach profile in configuration file
7320                for profile in prioritizations_config:
7321
7322                    # If profile is asked in param, or ALL are asked (empty profile [])
7323                    if profile in profiles or profiles == []:
7324                        log.info(f"Profile '{profile}'")
7325
7326                        sql_set_info_option = ""
7327
7328                        sql_set_info = []
7329
7330                        # PZ fields set
7331
7332                        # PZScore
7333                        if (
7334                            f"{pz_prefix}Score{pzfields_sep}{profile}"
7335                            in list_of_pzfields
7336                        ):
7337                            sql_set_info.append(
7338                                f"""
7339                                    concat(
7340                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
7341                                        {pz_prefix}Score{pzfields_sep}{profile}
7342                                    ) 
7343                                """
7344                            )
7345                            if (
7346                                profile == default_profile
7347                                and f"{pz_prefix}Score" in list_of_pzfields
7348                            ):
7349                                sql_set_info.append(
7350                                    f"""
7351                                        concat(
7352                                            '{pz_prefix}Score=',
7353                                            {pz_prefix}Score{pzfields_sep}{profile}
7354                                        )
7355                                    """
7356                                )
7357
7358                        # PZFlag
7359                        if (
7360                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
7361                            in list_of_pzfields
7362                        ):
7363                            sql_set_info.append(
7364                                f"""
7365                                    concat(
7366                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
7367                                        CASE 
7368                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7369                                            THEN 'PASS'
7370                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7371                                            THEN 'FILTERED'
7372                                        END
7373                                    ) 
7374                                """
7375                            )
7376                            if (
7377                                profile == default_profile
7378                                and f"{pz_prefix}Flag" in list_of_pzfields
7379                            ):
7380                                sql_set_info.append(
7381                                    f"""
7382                                        concat(
7383                                            '{pz_prefix}Flag=',
7384                                            CASE 
7385                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7386                                                THEN 'PASS'
7387                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7388                                                THEN 'FILTERED'
7389                                            END
7390                                        )
7391                                    """
7392                                )
7393
7394                        # PZClass
7395                        if (
7396                            f"{pz_prefix}Class{pzfields_sep}{profile}"
7397                            in list_of_pzfields
7398                        ):
7399                            sql_set_info.append(
7400                                f"""
7401                                    concat(
7402                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
7403                                        CASE
7404                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7405                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7406                                            ELSE '.'
7407                                        END 
7408                                    )
7409                                    
7410                                """
7411                            )
7412                            if (
7413                                profile == default_profile
7414                                and f"{pz_prefix}Class" in list_of_pzfields
7415                            ):
7416                                sql_set_info.append(
7417                                    f"""
7418                                        concat(
7419                                            '{pz_prefix}Class=',
7420                                            CASE
7421                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7422                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7423                                                ELSE '.'
7424                                            END 
7425                                        )
7426                                    """
7427                                )
7428
7429                        # PZComment
7430                        if (
7431                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7432                            in list_of_pzfields
7433                        ):
7434                            sql_set_info.append(
7435                                f"""
7436                                    CASE
7437                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7438                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7439                                        ELSE ''
7440                                    END
7441                                """
7442                            )
7443                            if (
7444                                profile == default_profile
7445                                and f"{pz_prefix}Comment" in list_of_pzfields
7446                            ):
7447                                sql_set_info.append(
7448                                    f"""
7449                                        CASE
7450                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7451                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7452                                            ELSE ''
7453                                        END
7454                                    """
7455                                )
7456
7457                        # PZInfos
7458                        if (
7459                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7460                            in list_of_pzfields
7461                        ):
7462                            sql_set_info.append(
7463                                f"""
7464                                    CASE
7465                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7466                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7467                                        ELSE ''
7468                                    END
7469                                """
7470                            )
7471                            if (
7472                                profile == default_profile
7473                                and f"{pz_prefix}Infos" in list_of_pzfields
7474                            ):
7475                                sql_set_info.append(
7476                                    f"""
7477                                        CASE
7478                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7479                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7480                                            ELSE ''
7481                                        END
7482                                    """
7483                                )
7484
7485                        # Merge PZfields
7486                        sql_set_info_option = ""
7487                        sql_set_sep = ""
7488                        for sql_set in sql_set_info:
7489                            if sql_set_sep:
7490                                sql_set_info_option += f"""
7491                                    , concat('{sql_set_sep}', {sql_set})
7492                                """
7493                            else:
7494                                sql_set_info_option += f"""
7495                                    , {sql_set}
7496                                """
7497                            sql_set_sep = ";"
7498
7499                        sql_queries = []
7500                        for annotation in prioritizations_config[profile]:
7501
7502                            # skip special sections
7503                            if annotation.startswith("_"):
7504                                continue
7505
7506                            # For each criterions
7507                            for criterion in prioritizations_config[profile][
7508                                annotation
7509                            ]:
7510
7511                                # Criterion mode
7512                                criterion_mode = None
7513                                if np.any(
7514                                    np.isin(list(criterion.keys()), ["type", "value"])
7515                                ):
7516                                    criterion_mode = "operation"
7517                                elif np.any(
7518                                    np.isin(list(criterion.keys()), ["sql", "fields"])
7519                                ):
7520                                    criterion_mode = "sql"
7521                                log.debug(f"Criterion Mode: {criterion_mode}")
7522
7523                                # Criterion parameters
7524                                criterion_type = criterion.get("type", None)
7525                                criterion_value = criterion.get("value", None)
7526                                criterion_sql = criterion.get("sql", None)
7527                                criterion_fields = criterion.get("fields", None)
7528                                criterion_score = criterion.get("score", 0)
7529                                criterion_flag = criterion.get("flag", "PASS")
7530                                criterion_class = criterion.get("class", None)
7531                                criterion_flag_bool = criterion_flag == "PASS"
7532                                criterion_comment = (
7533                                    ", ".join(criterion.get("comment", []))
7534                                    .replace("'", "''")
7535                                    .replace(";", ",")
7536                                    .replace("\t", " ")
7537                                )
7538                                criterion_infos = (
7539                                    str(criterion)
7540                                    .replace("'", "''")
7541                                    .replace(";", ",")
7542                                    .replace("\t", " ")
7543                                )
7544
7545                                # SQL
7546                                if criterion_sql is not None and isinstance(
7547                                    criterion_sql, list
7548                                ):
7549                                    criterion_sql = " ".join(criterion_sql)
7550
7551                                # Fields and explode
7552                                if criterion_fields is None:
7553                                    criterion_fields = [annotation]
7554                                if not isinstance(criterion_fields, list):
7555                                    criterion_fields = str(criterion_fields).split(",")
7556
7557                                # Class
7558                                if criterion_class is not None and not isinstance(
7559                                    criterion_class, list
7560                                ):
7561                                    criterion_class = str(criterion_class).split(",")
7562
7563                                for annotation_field in criterion_fields:
7564
7565                                    # Explode specific annotation
7566                                    log.debug(
7567                                        f"Explode annotation '{annotation_field}'"
7568                                    )
7569                                    added_columns += self.explode_infos(
7570                                        prefix=explode_infos_prefix,
7571                                        fields=[annotation_field],
7572                                        table=table_variants,
7573                                    )
7574                                    extra_infos = self.get_extra_infos(
7575                                        table=table_variants
7576                                    )
7577
7578                                    # Check if annotation field is present
7579                                    if (
7580                                        f"{explode_infos_prefix}{annotation_field}"
7581                                        not in extra_infos
7582                                    ):
7583                                        msq_err = f"Annotation '{annotation_field}' not in data"
7584                                        log.error(msq_err)
7585                                        raise ValueError(msq_err)
7586                                    else:
7587                                        log.debug(
7588                                            f"Annotation '{annotation_field}' in data"
7589                                        )
7590
7591                                sql_set = []
7592                                sql_set_info = []
7593
7594                                # PZ fields set
7595
7596                                # PZScore
7597                                if (
7598                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7599                                    in list_of_pzfields
7600                                ):
7601                                    # VaRank prioritization score mode
7602                                    if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
7603                                        sql_set.append(
7604                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
7605                                        )
7606                                    # default HOWARD prioritization score mode
7607                                    else:
7608                                        sql_set.append(
7609                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7610                                        )
7611
7612                                # PZFlag
7613                                if (
7614                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7615                                    in list_of_pzfields
7616                                ):
7617                                    sql_set.append(
7618                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7619                                    )
7620
7621                                # PZClass
7622                                if (
7623                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
7624                                    in list_of_pzfields
7625                                    and criterion_class is not None
7626                                ):
7627                                    sql_set.append(
7628                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
7629                                    )
7630
7631                                # PZComment
7632                                if (
7633                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7634                                    in list_of_pzfields
7635                                ):
7636                                    sql_set.append(
7637                                        f"""
7638                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7639                                                concat(
7640                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7641                                                    CASE 
7642                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7643                                                        THEN ', '
7644                                                        ELSE ''
7645                                                    END,
7646                                                    '{criterion_comment}'
7647                                                )
7648                                        """
7649                                    )
7650
7651                                # PZInfos
7652                                if (
7653                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7654                                    in list_of_pzfields
7655                                ):
7656                                    sql_set.append(
7657                                        f"""
7658                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7659                                                concat(
7660                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7661                                                    '{criterion_infos}'
7662                                                )
7663                                        """
7664                                    )
7665                                sql_set_option = ",".join(sql_set)
7666
7667                                # Criterion and comparison
7668                                if sql_set_option:
7669
7670                                    if criterion_mode in ["operation"]:
7671
7672                                        try:
7673                                            float(criterion_value)
7674                                            sql_update = f"""
7675                                                UPDATE {table_variants}
7676                                                SET {sql_set_option}
7677                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7678                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7679                                            """
7680                                        except:
7681                                            contains_option = ""
7682                                            if criterion_type == "contains":
7683                                                contains_option = ".*"
7684                                            sql_update = f"""
7685                                                UPDATE {table_variants}
7686                                                SET {sql_set_option}
7687                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7688                                            """
7689                                        sql_queries.append(sql_update)
7690
7691                                    elif criterion_mode in ["sql"]:
7692
7693                                        sql_update = f"""
7694                                            UPDATE {table_variants}
7695                                            SET {sql_set_option}
7696                                            WHERE {criterion_sql}
7697                                        """
7698                                        sql_queries.append(sql_update)
7699
7700                                    else:
7701                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
7702                                        log.error(msg_err)
7703                                        raise ValueError(msg_err)
7704
7705                                else:
7706                                    log.warning(
7707                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7708                                    )
7709
7710                        # PZTags
7711                        if (
7712                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7713                            in list_of_pzfields
7714                        ):
7715
7716                            # Create PZFalgs value
7717                            pztags_value = ""
7718                            pztags_sep_default = ","
7719                            pztags_sep = ""
7720                            for pzfield in pzfields:
7721                                if pzfield not in [f"{pz_prefix}Tags"]:
7722                                    if (
7723                                        f"{pzfield}{pzfields_sep}{profile}"
7724                                        in list_of_pzfields
7725                                    ):
7726                                        if pzfield in [f"{pz_prefix}Flag"]:
7727                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7728                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7729                                                    THEN 'PASS'
7730                                                    ELSE 'FILTERED'
7731                                                END, '"""
7732                                        elif pzfield in [f"{pz_prefix}Class"]:
7733                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7734                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7735                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7736                                                    ELSE '.'
7737                                                END, '"""
7738                                        else:
7739                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7740                                        pztags_sep = pztags_sep_default
7741
7742                            # Add Query update for PZFlags
7743                            sql_update_pztags = f"""
7744                                UPDATE {table_variants}
7745                                SET INFO = concat(
7746                                        INFO,
7747                                        CASE WHEN INFO NOT in ('','.')
7748                                                THEN ';'
7749                                                ELSE ''
7750                                        END,
7751                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7752                                    )
7753                                """
7754                            sql_queries.append(sql_update_pztags)
7755
7756                            # Add Query update for PZFlags for default
7757                            if profile == default_profile:
7758                                sql_update_pztags_default = f"""
7759                                UPDATE {table_variants}
7760                                SET INFO = concat(
7761                                        INFO,
7762                                        ';',
7763                                        '{pz_prefix}Tags={pztags_value}'
7764                                    )
7765                                """
7766                                sql_queries.append(sql_update_pztags_default)
7767
7768                        log.info(f"""Profile '{profile}' - Prioritization... """)
7769
7770                        if sql_queries:
7771
7772                            for sql_query in sql_queries:
7773                                log.debug(
7774                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7775                                )
7776                                self.conn.execute(sql_query)
7777
7778                        log.info(f"""Profile '{profile}' - Update... """)
7779                        sql_query_update = f"""
7780                            UPDATE {table_variants}
7781                            SET INFO =  
7782                                concat(
7783                                    CASE
7784                                        WHEN INFO NOT IN ('','.')
7785                                        THEN concat(INFO, ';')
7786                                        ELSE ''
7787                                    END
7788                                    {sql_set_info_option}
7789                                )
7790                        """
7791                        self.conn.execute(sql_query_update)
7792
7793        else:
7794
7795            log.warning(f"No profiles in parameters")
7796
7797        # Remove added columns
7798        for added_column in added_columns:
7799            self.drop_column(column=added_column)
7800
7801        # Explode INFOS fields into table fields
7802        if self.get_explode_infos():
7803            self.explode_infos(
7804                prefix=self.get_explode_infos_prefix(),
7805                fields=self.get_explode_infos_fields(),
7806                force=True,
7807            )
7808
7809        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file.
Returns

A boolean value (True) is being returned from the prioritization function.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Variants (SNV/InDel only) are annotated in parallel with Dask partitions, the
        result is stored in a temporary table column, then appended to the INFO field
        as 'hgvs=...' and declared in the VCF header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object providing "CHROM", "POS", "REF" and "ALT"
            :return: a comma-separated string of the HGVS names for the given row.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (refseq_df is resolved by the Polars SQL context via register_globals)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): refseqlink_df only exists when a refSeqLink file was
                # found; if protein output is requested without one, this query would
                # reference an undefined table — confirm upstream guarantees
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name (transcript-level; protein-level if use_protein/full_format)
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated, VCF-friendly)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion (captured by the partition closures above; re-bound later
        # once the refseq dataframes exist)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (with default fallback)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Genome path as configured (no default — empty string when unset)
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refSeq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse 'hgvs_options' ("opt[=val],opt2,...") into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # A bare option name means 'enable'
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; nothing to do otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit configured path first, otherwise resolve from folder + assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT strictly alphabetic)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked so they can be dropped at the end)
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): random suffix lowers but does not eliminate the chance of a
        # name collision with an existing column — confirm acceptable
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe: transcripts overlapping each variant position
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # Re-created here, presumably so refseq_df / refseqlink_df are registered as
        # globals; the partition closures resolve `polars_conn` at call time
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column (skip empty/NULL annotations)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' (with ';' separator when INFO non-empty)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): "annotatation" typo below is emitted into the VCF header
        # Description — left as-is here because changing it changes output
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
8202    def get_operations_help(
8203        self, operations_config_dict: dict = {}, operations_config_file: str = None
8204    ) -> list:
8205
8206        # Init
8207        operations_help = []
8208
8209        # operations
8210        operations = self.get_config_json(
8211            name="calculations",
8212            config_dict=operations_config_dict,
8213            config_file=operations_config_file,
8214        )
8215        for op in operations:
8216            op_name = operations[op].get("name", op).upper()
8217            op_description = operations[op].get("description", op_name)
8218            op_available = operations[op].get("available", False)
8219            if op_available:
8220                operations_help.append(f"   {op_name}: {op_description}")
8221
8222        # Sort operations
8223        operations_help.sort()
8224
8225        # insert header
8226        operations_help.insert(0, "Available calculation operations:")
8227
8228        # Return
8229        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function.

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle": null
                }
            }

        :param operations: Dict of operations to perform (name -> options); may be
            overridden by param["calculation"]["calculations"] and extended by the
            quick comma-separated param["calculations"] list
        :param operations_config_dict: Optional dict of operations configuration
            (merged through `get_config_json`)
        :param operations_config_file: Optional path to an operations configuration
            file; falls back to param["calculation"]["calculation_config"]
        :raises ValueError: if an operation name or its configured type is unknown
        """

        # Param
        param = self.get_param()

        # Check operations config file (param fallback when not given explicitly)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys — operation lookup is case-insensitive
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (param takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add (comma-separated list, e.g. "VARTYPE,NOMEN")
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order: quick ops first,
            # then any remaining operations already configured in param)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (last-resort fallback to param)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operations: dispatch on the configured type ('python' or 'sql')
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
8359    def calculation_process_sql(
8360        self, operation: dict, operation_name: str = "unknown"
8361    ) -> None:
8362        """
8363        The `calculation_process_sql` function takes in a mathematical operation as a string and
8364        performs the operation, updating the specified table with the result.
8365
8366        :param operation: The `operation` parameter is a dictionary that contains information about the
8367        mathematical operation to be performed. It includes the following keys:
8368        :type operation: dict
8369        :param operation_name: The `operation_name` parameter is a string that represents the name of
8370        the mathematical operation being performed. It is used for logging and error handling purposes,
8371        defaults to unknown
8372        :type operation_name: str (optional)
8373        """
8374
8375        # Operation infos
8376        operation_name = operation.get("name", "unknown")
8377        log.debug(f"process SQL {operation_name}")
8378        output_column_name = operation.get("output_column_name", operation_name)
8379        output_column_type = operation.get("output_column_type", "String")
8380        prefix = operation.get("explode_infos_prefix", "")
8381        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
8382        output_column_description = operation.get(
8383            "output_column_description", f"{operation_name} operation"
8384        )
8385        operation_query = operation.get("operation_query", None)
8386        if isinstance(operation_query, list):
8387            operation_query = " ".join(operation_query)
8388        operation_info_fields = operation.get("info_fields", [])
8389        operation_info_fields_check = operation.get("info_fields_check", False)
8390        operation_info = operation.get("operation_info", True)
8391        operation_table = operation.get(
8392            "table", self.get_table_variants(clause="alter")
8393        )
8394
8395        # table variants
8396        if operation_table:
8397            table_variants = operation_table
8398        else:
8399            table_variants = self.get_table_variants(clause="alter")
8400
8401        if operation_query:
8402
8403            # Info fields check
8404            operation_info_fields_check_result = True
8405            if operation_info_fields_check:
8406                header_infos = self.get_header().infos
8407                for info_field in operation_info_fields:
8408                    operation_info_fields_check_result = (
8409                        operation_info_fields_check_result
8410                        and info_field in header_infos
8411                    )
8412
8413            # If info fields available
8414            if operation_info_fields_check_result:
8415
8416                # Added_columns
8417                added_columns = []
8418
8419                # Create VCF header field
8420                vcf_reader = self.get_header()
8421                vcf_reader.infos[output_column_name] = vcf.parser._Info(
8422                    output_column_name,
8423                    ".",
8424                    output_column_type,
8425                    output_column_description,
8426                    "howard calculation",
8427                    "0",
8428                    self.code_type_map.get(output_column_type),
8429                )
8430
8431                # Explode infos if needed
8432                log.debug(f"calculation_process_sql prefix {prefix}")
8433                added_columns += self.explode_infos(
8434                    prefix=prefix,
8435                    fields=[output_column_name] + operation_info_fields,
8436                    force=False,
8437                    table=table_variants,
8438                )
8439
8440                # Create column
8441                added_column = self.add_column(
8442                    table_name=table_variants,
8443                    column_name=prefix + output_column_name,
8444                    column_type=output_column_type_sql,
8445                    default_value="null",
8446                )
8447                added_columns.append(added_column)
8448
8449                # Operation calculation
8450                try:
8451
8452                    # Query to update calculation column
8453                    sql_update = f"""
8454                        UPDATE {table_variants}
8455                        SET "{prefix}{output_column_name}" = ({operation_query})
8456                    """
8457                    self.conn.execute(sql_update)
8458
8459                    # Add to INFO
8460                    if operation_info:
8461                        sql_update_info = f"""
8462                            UPDATE {table_variants}
8463                            SET "INFO" =
8464                                concat(
8465                                    CASE
8466                                        WHEN "INFO" IS NOT NULL
8467                                        THEN concat("INFO", ';')
8468                                        ELSE ''
8469                                    END,
8470                                    '{output_column_name}=',
8471                                    "{prefix}{output_column_name}"
8472                                )
8473                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
8474                        """
8475                        self.conn.execute(sql_update_info)
8476
8477                except:
8478                    log.error(
8479                        f"Operations config: Calculation '{operation_name}' query failed"
8480                    )
8481                    raise ValueError(
8482                        f"Operations config: Calculation '{operation_name}' query failed"
8483                    )
8484
8485                # Remove added columns
8486                for added_column in added_columns:
8487                    log.debug(f"added_column: {added_column}")
8488                    self.drop_column(column=added_column)
8489
8490            else:
8491                log.error(
8492                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8493                )
8494                raise ValueError(
8495                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8496                )
8497
8498        else:
8499            log.error(
8500                f"Operations config: Calculation '{operation_name}' query NOT defined"
8501            )
8502            raise ValueError(
8503                f"Operations config: Calculation '{operation_name}' query NOT defined"
8504            )

The calculation_process_sql function takes an operation configuration dictionary, executes its SQL query, and updates the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that describes the SQL operation to be performed, including its name, SQL query, input INFO fields, and output column settings.
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8506    def calculation_process_function(
8507        self, operation: dict, operation_name: str = "unknown"
8508    ) -> None:
8509        """
8510        The `calculation_process_function` takes in an operation dictionary and performs the specified
8511        function with the given parameters.
8512
8513        :param operation: The `operation` parameter is a dictionary that contains information about the
8514        operation to be performed. It has the following keys:
8515        :type operation: dict
8516        :param operation_name: The `operation_name` parameter is a string that represents the name of
8517        the operation being performed. It is used for logging purposes, defaults to unknown
8518        :type operation_name: str (optional)
8519        """
8520
8521        operation_name = operation["name"]
8522        log.debug(f"process Python {operation_name}")
8523        function_name = operation["function_name"]
8524        function_params = operation["function_params"]
8525        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
8527    def calculation_variant_id(self) -> None:
8528        """
8529        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
8530        updates the INFO field of a variants table with the variant ID.
8531        """
8532
8533        # variant_id annotation field
8534        variant_id_tag = self.get_variant_id_column()
8535        added_columns = [variant_id_tag]
8536
8537        # variant_id hgvs tags"
8538        vcf_infos_tags = {
8539            variant_id_tag: "howard variant ID annotation",
8540        }
8541
8542        # Variants table
8543        table_variants = self.get_table_variants()
8544
8545        # Header
8546        vcf_reader = self.get_header()
8547
8548        # Add variant_id to header
8549        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
8550            variant_id_tag,
8551            ".",
8552            "String",
8553            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
8554            "howard calculation",
8555            "0",
8556            self.code_type_map.get("String"),
8557        )
8558
8559        # Update
8560        sql_update = f"""
8561            UPDATE {table_variants}
8562            SET "INFO" = 
8563                concat(
8564                    CASE
8565                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8566                        THEN ''
8567                        ELSE concat("INFO", ';')
8568                    END,
8569                    '{variant_id_tag}=',
8570                    "{variant_id_tag}"
8571                )
8572        """
8573        self.conn.execute(sql_update)
8574
8575        # Remove added columns
8576        for added_column in added_columns:
8577            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
8579    def calculation_extract_snpeff_hgvs(
8580        self,
8581        snpeff_hgvs: str = "snpeff_hgvs",
8582        snpeff_field: str = "ANN",
8583    ) -> None:
8584        """
8585        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
8586        annotation field in a VCF file and adds them as a new column in the variants table.
8587
8588        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
8589        function is used to specify the name of the column that will store the HGVS nomenclatures
8590        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
8591        snpeff_hgvs
8592        :type snpeff_hgvs: str (optional)
8593        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
8594        function represents the field in the VCF file that contains SnpEff annotations. This field is
8595        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
8596        to ANN
8597        :type snpeff_field: str (optional)
8598        """
8599
8600        # Snpeff hgvs tags
8601        vcf_infos_tags = {
8602            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
8603        }
8604
8605        # Prefix
8606        prefix = self.get_explode_infos_prefix()
8607        if prefix:
8608            prefix = "INFO/"
8609
8610        # snpEff fields
8611        speff_ann_infos = prefix + snpeff_field
8612        speff_hgvs_infos = prefix + snpeff_hgvs
8613
8614        # Variants table
8615        table_variants = self.get_table_variants()
8616
8617        # Header
8618        vcf_reader = self.get_header()
8619
8620        # Add columns
8621        added_columns = []
8622
8623        # Explode HGVS field in column
8624        added_columns += self.explode_infos(fields=[snpeff_field])
8625
8626        if snpeff_field in vcf_reader.infos:
8627
8628            log.debug(vcf_reader.infos[snpeff_field])
8629
8630            # Extract ANN header
8631            ann_description = vcf_reader.infos[snpeff_field].desc
8632            pattern = r"'(.+?)'"
8633            match = re.search(pattern, ann_description)
8634            if match:
8635                ann_header_match = match.group(1).split(" | ")
8636                ann_header_desc = {}
8637                for i in range(len(ann_header_match)):
8638                    ann_header_info = "".join(
8639                        char for char in ann_header_match[i] if char.isalnum()
8640                    )
8641                    ann_header_desc[ann_header_info] = ann_header_match[i]
8642                if not ann_header_desc:
8643                    raise ValueError("Invalid header description format")
8644            else:
8645                raise ValueError("Invalid header description format")
8646
8647            # Create variant id
8648            variant_id_column = self.get_variant_id_column()
8649            added_columns += [variant_id_column]
8650
8651            # Create dataframe
8652            dataframe_snpeff_hgvs = self.get_query_to_df(
8653                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8654            )
8655
8656            # Create main NOMEN column
8657            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8658                speff_ann_infos
8659            ].apply(
8660                lambda x: extract_snpeff_hgvs(
8661                    str(x), header=list(ann_header_desc.values())
8662                )
8663            )
8664
8665            # Add snpeff_hgvs to header
8666            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8667                snpeff_hgvs,
8668                ".",
8669                "String",
8670                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8671                "howard calculation",
8672                "0",
8673                self.code_type_map.get("String"),
8674            )
8675
8676            # Update
8677            sql_update = f"""
8678                UPDATE variants
8679                SET "INFO" = 
8680                    concat(
8681                        CASE
8682                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8683                            THEN ''
8684                            ELSE concat("INFO", ';')
8685                        END,
8686                        CASE 
8687                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8688                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8689                            THEN concat(
8690                                    '{snpeff_hgvs}=',
8691                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8692                                )
8693                            ELSE ''
8694                        END
8695                    )
8696                FROM dataframe_snpeff_hgvs
8697                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8698
8699            """
8700            self.conn.execute(sql_update)
8701
8702            # Delete dataframe
8703            del dataframe_snpeff_hgvs
8704            gc.collect()
8705
8706        else:
8707
8708            log.warning(
8709                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8710            )
8711
8712        # Remove added columns
8713        for added_column in added_columns:
8714            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file, defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8716    def calculation_snpeff_ann_explode(
8717        self,
8718        uniquify: bool = True,
8719        output_format: str = "fields",
8720        output_prefix: str = "snpeff_",
8721        snpeff_field: str = "ANN",
8722    ) -> None:
8723        """
8724        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8725        exploding the HGVS field and updating variant information accordingly.
8726
8727        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8728        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8729        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8730        defaults to True
8731        :type uniquify: bool (optional)
8732        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8733        function specifies the format in which the output annotations will be generated. It has a
8734        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8735        format, defaults to fields
8736        :type output_format: str (optional)
8737        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8738        method is used to specify the prefix that will be added to the output annotations generated
8739        during the calculation process. This prefix helps to differentiate the newly added annotations
8740        from existing ones in the output data. By default, the, defaults to ANN_
8741        :type output_prefix: str (optional)
8742        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8743        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8744        field will be processed to explode the HGVS annotations and update the variant information
8745        accordingly, defaults to ANN
8746        :type snpeff_field: str (optional)
8747        """
8748
8749        # SnpEff annotation field
8750        snpeff_hgvs = "snpeff_ann_explode"
8751
8752        # Snpeff hgvs tags
8753        vcf_infos_tags = {
8754            snpeff_hgvs: "Explode snpEff annotations",
8755        }
8756
8757        # Prefix
8758        prefix = self.get_explode_infos_prefix()
8759        if prefix:
8760            prefix = "INFO/"
8761
8762        # snpEff fields
8763        speff_ann_infos = prefix + snpeff_field
8764        speff_hgvs_infos = prefix + snpeff_hgvs
8765
8766        # Variants table
8767        table_variants = self.get_table_variants()
8768
8769        # Header
8770        vcf_reader = self.get_header()
8771
8772        # Add columns
8773        added_columns = []
8774
8775        # Explode HGVS field in column
8776        added_columns += self.explode_infos(fields=[snpeff_field])
8777        log.debug(f"snpeff_field={snpeff_field}")
8778        log.debug(f"added_columns={added_columns}")
8779
8780        if snpeff_field in vcf_reader.infos:
8781
8782            # Extract ANN header
8783            ann_description = vcf_reader.infos[snpeff_field].desc
8784            pattern = r"'(.+?)'"
8785            match = re.search(pattern, ann_description)
8786            if match:
8787                ann_header_match = match.group(1).split(" | ")
8788                ann_header = []
8789                ann_header_desc = {}
8790                for i in range(len(ann_header_match)):
8791                    ann_header_info = "".join(
8792                        char for char in ann_header_match[i] if char.isalnum()
8793                    )
8794                    ann_header.append(ann_header_info)
8795                    ann_header_desc[ann_header_info] = ann_header_match[i]
8796                if not ann_header_desc:
8797                    raise ValueError("Invalid header description format")
8798            else:
8799                raise ValueError("Invalid header description format")
8800
8801            # Create variant id
8802            variant_id_column = self.get_variant_id_column()
8803            added_columns += [variant_id_column]
8804
8805            # Create dataframe
8806            dataframe_snpeff_hgvs = self.get_query_to_df(
8807                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8808            )
8809
8810            # Create snpEff columns
8811            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8812                speff_ann_infos
8813            ].apply(
8814                lambda x: explode_snpeff_ann(
8815                    str(x),
8816                    uniquify=uniquify,
8817                    output_format=output_format,
8818                    prefix=output_prefix,
8819                    header=list(ann_header_desc.values()),
8820                )
8821            )
8822
8823            # Header
8824            ann_annotations_prefix = ""
8825            if output_format.upper() in ["JSON"]:
8826                ann_annotations_prefix = f"{output_prefix}="
8827                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8828                    output_prefix,
8829                    ".",
8830                    "String",
8831                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8832                    + " - JSON format",
8833                    "howard calculation",
8834                    "0",
8835                    self.code_type_map.get("String"),
8836                )
8837            else:
8838                for ann_annotation in ann_header:
8839                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8840                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8841                        ann_annotation_id,
8842                        ".",
8843                        "String",
8844                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8845                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8846                        "howard calculation",
8847                        "0",
8848                        self.code_type_map.get("String"),
8849                    )
8850
8851            # Update
8852            sql_update = f"""
8853                UPDATE variants
8854                SET "INFO" = 
8855                    concat(
8856                        CASE
8857                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8858                            THEN ''
8859                            ELSE concat("INFO", ';')
8860                        END,
8861                        CASE 
8862                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8863                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8864                            THEN concat(
8865                                '{ann_annotations_prefix}',
8866                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8867                                )
8868                            ELSE ''
8869                        END
8870                    )
8871                FROM dataframe_snpeff_hgvs
8872                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8873
8874            """
8875            self.conn.execute(sql_update)
8876
8877            # Delete dataframe
8878            del dataframe_snpeff_hgvs
8879            gc.collect()
8880
8881        else:
8882
8883            log.warning(
8884                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8885            )
8886
8887        # Remove added columns
8888        for added_column in added_columns:
8889            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data, defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS nomenclature components (NOMEN and related fields)
        from an exploded HGVS annotation and append them to the INFO field.

        Configuration is read from param["calculation"]["calculations"]["NOMEN"]["options"]:
        "hgvs_field", "pattern", "transcripts" (file of preferred transcripts),
        "transcripts_table"/"transcripts_column" (per-variant transcript) and
        "transcripts_order" (preference between "column" and "file" sources).

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the per-variant
        # dict of NOMEN components returned by find_nomen()
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN components and their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads
        # NOTE(review): 'threads' is retrieved but not used in this method
        threads = self.get_threads()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Get HGVS field name from NOMEN options
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern from NOMEN options
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Transcripts of preference, keyed by source ("file", ...)
        transcripts_sources = {}

        # Load preferred transcripts from file, if configured
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the file is the transcript list
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table (defaults to the variants table)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        # Per-variant transcript expression used in the SELECT below
        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            extra_field_transcript = f"NULL"

        # Preference order between transcript sources
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Exploded INFO columns available in the table
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Load variant keys, HGVS and transcript into a dataframe
            # NOTE(review): queries the 'variants' table directly rather than
            # self.get_table_variants() — confirm this is intended
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Rank preferred transcripts by their order in the file (1-based)
            transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)}
            transcripts_len = len(transcripts_rank)

            # Compute the NOMEN component dict for each variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len
                ),
                axis=1,
            )

            # Register each NOMEN component in the VCF header and build the
            # SQL CASE fragment appending ';<field>=<value>' when non-empty
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append all NOMEN components to INFO, joined by variant key
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
9095    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
9096        """
9097        The function `calculation_find_by_pipeline` performs a calculation to find the number of
9098        pipeline/sample for a variant and updates the variant information in a VCF file.
9099
9100        :param tag: The `tag` parameter is a string that represents the annotation field for the
9101        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
9102        VCF header and to update the corresponding field in the variants table, defaults to
9103        findbypipeline
9104        :type tag: str (optional)
9105        """
9106
9107        # if FORMAT and samples
9108        if (
9109            "FORMAT" in self.get_header_columns_as_list()
9110            and self.get_header_sample_list()
9111        ):
9112
9113            # findbypipeline annotation field
9114            findbypipeline_tag = tag
9115
9116            # VCF infos tags
9117            vcf_infos_tags = {
9118                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
9119            }
9120
9121            # Prefix
9122            prefix = self.get_explode_infos_prefix()
9123
9124            # Field
9125            findbypipeline_infos = prefix + findbypipeline_tag
9126
9127            # Variants table
9128            table_variants = self.get_table_variants()
9129
9130            # Header
9131            vcf_reader = self.get_header()
9132
9133            # Create variant id
9134            variant_id_column = self.get_variant_id_column()
9135            added_columns = [variant_id_column]
9136
9137            # variant_id, FORMAT and samples
9138            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9139                self.get_header_sample_list()
9140            )
9141
9142            # Create dataframe
9143            dataframe_findbypipeline = self.get_query_to_df(
9144                f""" SELECT {samples_fields} FROM {table_variants} """
9145            )
9146
9147            # Create findbypipeline column
9148            dataframe_findbypipeline[findbypipeline_infos] = (
9149                dataframe_findbypipeline.apply(
9150                    lambda row: findbypipeline(
9151                        row, samples=self.get_header_sample_list()
9152                    ),
9153                    axis=1,
9154                )
9155            )
9156
9157            # Add snpeff_hgvs to header
9158            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
9159                findbypipeline_tag,
9160                ".",
9161                "String",
9162                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
9163                "howard calculation",
9164                "0",
9165                self.code_type_map.get("String"),
9166            )
9167
9168            # Update
9169            sql_update = f"""
9170                UPDATE variants
9171                SET "INFO" = 
9172                    concat(
9173                        CASE
9174                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9175                            THEN ''
9176                            ELSE concat("INFO", ';')
9177                        END,
9178                        CASE 
9179                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
9180                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
9181                            THEN concat(
9182                                    '{findbypipeline_tag}=',
9183                                    dataframe_findbypipeline."{findbypipeline_infos}"
9184                                )
9185                            ELSE ''
9186                        END
9187                    )
9188                FROM dataframe_findbypipeline
9189                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
9190            """
9191            self.conn.execute(sql_update)
9192
9193            # Remove added columns
9194            for added_column in added_columns:
9195                self.drop_column(column=added_column)
9196
9197            # Delete dataframe
9198            del dataframe_findbypipeline
9199            gc.collect()

The function calculation_find_by_pipeline computes the number of pipelines/samples in which each variant is found and appends this information to the variant's INFO field in the VCF data.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
9201    def calculation_genotype_concordance(self) -> None:
9202        """
9203        The function `calculation_genotype_concordance` calculates the genotype concordance for
9204        multi-caller VCF files and updates the variant information in the database.
9205        """
9206
9207        # if FORMAT and samples
9208        if (
9209            "FORMAT" in self.get_header_columns_as_list()
9210            and self.get_header_sample_list()
9211        ):
9212
9213            # genotypeconcordance annotation field
9214            genotypeconcordance_tag = "genotypeconcordance"
9215
9216            # VCF infos tags
9217            vcf_infos_tags = {
9218                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
9219            }
9220
9221            # Prefix
9222            prefix = self.get_explode_infos_prefix()
9223
9224            # Field
9225            genotypeconcordance_infos = prefix + genotypeconcordance_tag
9226
9227            # Variants table
9228            table_variants = self.get_table_variants()
9229
9230            # Header
9231            vcf_reader = self.get_header()
9232
9233            # Create variant id
9234            variant_id_column = self.get_variant_id_column()
9235            added_columns = [variant_id_column]
9236
9237            # variant_id, FORMAT and samples
9238            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9239                self.get_header_sample_list()
9240            )
9241
9242            # Create dataframe
9243            dataframe_genotypeconcordance = self.get_query_to_df(
9244                f""" SELECT {samples_fields} FROM {table_variants} """
9245            )
9246
9247            # Create genotypeconcordance column
9248            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
9249                dataframe_genotypeconcordance.apply(
9250                    lambda row: genotypeconcordance(
9251                        row, samples=self.get_header_sample_list()
9252                    ),
9253                    axis=1,
9254                )
9255            )
9256
9257            # Add genotypeconcordance to header
9258            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
9259                genotypeconcordance_tag,
9260                ".",
9261                "String",
9262                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
9263                "howard calculation",
9264                "0",
9265                self.code_type_map.get("String"),
9266            )
9267
9268            # Update
9269            sql_update = f"""
9270                UPDATE variants
9271                SET "INFO" = 
9272                    concat(
9273                        CASE
9274                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9275                            THEN ''
9276                            ELSE concat("INFO", ';')
9277                        END,
9278                        CASE
9279                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
9280                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
9281                            THEN concat(
9282                                    '{genotypeconcordance_tag}=',
9283                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
9284                                )
9285                            ELSE ''
9286                        END
9287                    )
9288                FROM dataframe_genotypeconcordance
9289                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
9290            """
9291            self.conn.execute(sql_update)
9292
9293            # Remove added columns
9294            for added_column in added_columns:
9295                self.drop_column(column=added_column)
9296
9297            # Delete dataframe
9298            del dataframe_genotypeconcordance
9299            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
9301    def calculation_barcode(self, tag: str = "barcode") -> None:
9302        """
9303        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
9304        updates the INFO field in the file with the calculated barcode values.
9305
9306        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
9307        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
9308        the default tag name is set to "barcode", defaults to barcode
9309        :type tag: str (optional)
9310        """
9311
9312        # if FORMAT and samples
9313        if (
9314            "FORMAT" in self.get_header_columns_as_list()
9315            and self.get_header_sample_list()
9316        ):
9317
9318            # barcode annotation field
9319            if not tag:
9320                tag = "barcode"
9321
9322            # VCF infos tags
9323            vcf_infos_tags = {
9324                tag: "barcode calculation (VaRank)",
9325            }
9326
9327            # Prefix
9328            prefix = self.get_explode_infos_prefix()
9329
9330            # Field
9331            barcode_infos = prefix + tag
9332
9333            # Variants table
9334            table_variants = self.get_table_variants()
9335
9336            # Header
9337            vcf_reader = self.get_header()
9338
9339            # Create variant id
9340            variant_id_column = self.get_variant_id_column()
9341            added_columns = [variant_id_column]
9342
9343            # variant_id, FORMAT and samples
9344            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9345                self.get_header_sample_list()
9346            )
9347
9348            # Create dataframe
9349            dataframe_barcode = self.get_query_to_df(
9350                f""" SELECT {samples_fields} FROM {table_variants} """
9351            )
9352
9353            # Create barcode column
9354            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9355                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
9356            )
9357
9358            # Add barcode to header
9359            vcf_reader.infos[tag] = vcf.parser._Info(
9360                tag,
9361                ".",
9362                "String",
9363                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
9364                "howard calculation",
9365                "0",
9366                self.code_type_map.get("String"),
9367            )
9368
9369            # Update
9370            sql_update = f"""
9371                UPDATE {table_variants}
9372                SET "INFO" = 
9373                    concat(
9374                        CASE
9375                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9376                            THEN ''
9377                            ELSE concat("INFO", ';')
9378                        END,
9379                        CASE
9380                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
9381                            AND dataframe_barcode."{barcode_infos}" NOT NULL
9382                            THEN concat(
9383                                    '{tag}=',
9384                                    dataframe_barcode."{barcode_infos}"
9385                                )
9386                            ELSE ''
9387                        END
9388                    )
9389                FROM dataframe_barcode
9390                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9391            """
9392            self.conn.execute(sql_update)
9393
9394            # Remove added columns
9395            for added_column in added_columns:
9396                self.drop_column(column=added_column)
9397
9398            # Delete dataframe
9399            del dataframe_barcode
9400            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Calculate a family barcode for variants over a set of pedigree samples
        and write it into each sample's genotype (FORMAT) columns.

        Two FORMAT fields are added per genotype: `<tag>` (the barcode value)
        and `<tag>S` (the comma-joined list of pedigree samples used).

        The pedigree is read from
        param["calculation"]["calculations"]["BARCODEFAMILY"]["family_pedigree"],
        which may be a YAML file path, a JSON string, a comma-separated sample
        list, or a dict; when absent, all samples of the header are used.

        :param tag: barcode tag added to the VCF FORMAT fields; falls back to
            "BCF" when empty, defaults to BCF
        :type tag: str (optional)
        """

        # Only applicable when the VCF carries genotypes
        # (a FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default if empty)
            if not tag:
                tag = "BCF"

            # VCF infos tags (header descriptions for both FORMAT fields)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree definition from the calculation parameters
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: accept a file path, a JSON string, a comma-separated
            # sample list, or an already-parsed dict
            if ped:

                # Pedigree is a file (YAML; safe_load also parses JSON files)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as
                # a comma-separated list of sample names (identity mapping)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict (member -> sample name)
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the list of sample names from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree defined: use every sample from the VCF header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree: must contain at least one member
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved pedigree members
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (registered for removal afterwards)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the pedigree samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe restricted to the genotype columns
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row over the pedigree samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare both new FORMAT fields in the header:
            # <tag> (barcode value) and <tag>S (samples used)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per genotype column:
            # - pedigree samples get the barcode value and the sample list
            # - the FORMAT column gets the new field names appended
            # - non-pedigree samples get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For missing genotypes ('./.') the two regexp_replace calls
                # rebuild a placeholder genotype matching the FORMAT layout
                # ('./.' followed by ':.' per remaining FORMAT key) before the
                # new fields are appended
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
9592    def calculation_trio(self) -> None:
9593        """
9594        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
9595        information to the INFO field of each variant.
9596        """
9597
9598        # if FORMAT and samples
9599        if (
9600            "FORMAT" in self.get_header_columns_as_list()
9601            and self.get_header_sample_list()
9602        ):
9603
9604            # trio annotation field
9605            trio_tag = "trio"
9606
9607            # VCF infos tags
9608            vcf_infos_tags = {
9609                "trio": "trio calculation",
9610            }
9611
9612            # Param
9613            param = self.get_param()
9614
9615            # Prefix
9616            prefix = self.get_explode_infos_prefix()
9617
9618            # Trio param
9619            trio_ped = (
9620                param.get("calculation", {})
9621                .get("calculations", {})
9622                .get("TRIO", {})
9623                .get("trio_pedigree", None)
9624            )
9625
9626            # Load trio
9627            if trio_ped:
9628
9629                # Trio pedigree is a file
9630                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
9631                    log.debug("TRIO pedigree is file")
9632                    with open(full_path(trio_ped)) as trio_ped:
9633                        trio_ped = yaml.safe_load(trio_ped)
9634
9635                # Trio pedigree is a string
9636                elif isinstance(trio_ped, str):
9637                    log.debug("TRIO pedigree is str")
9638                    try:
9639                        trio_ped = json.loads(trio_ped)
9640                        log.debug("TRIO pedigree is json str")
9641                    except ValueError as e:
9642                        trio_samples = trio_ped.split(",")
9643                        if len(trio_samples) == 3:
9644                            trio_ped = {
9645                                "father": trio_samples[0],
9646                                "mother": trio_samples[1],
9647                                "child": trio_samples[2],
9648                            }
9649                            log.debug("TRIO pedigree is list str")
9650                        else:
9651                            msg_error = "TRIO pedigree not well formatted"
9652                            log.error(msg_error)
9653                            raise ValueError(msg_error)
9654
9655                # Trio pedigree is a dict
9656                elif isinstance(trio_ped, dict):
9657                    log.debug("TRIO pedigree is dict")
9658
9659                # Trio pedigree is not well formatted
9660                else:
9661                    msg_error = "TRIO pedigree not well formatted"
9662                    log.error(msg_error)
9663                    raise ValueError(msg_error)
9664
9665                # Construct trio list
9666                trio_samples = [
9667                    trio_ped.get("father", ""),
9668                    trio_ped.get("mother", ""),
9669                    trio_ped.get("child", ""),
9670                ]
9671
9672            else:
9673                log.debug("TRIO pedigree not defined. Take the first 3 samples")
9674                samples_list = self.get_header_sample_list()
9675                if len(samples_list) >= 3:
9676                    trio_samples = self.get_header_sample_list()[0:3]
9677                    trio_ped = {
9678                        "father": trio_samples[0],
9679                        "mother": trio_samples[1],
9680                        "child": trio_samples[2],
9681                    }
9682                else:
9683                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
9684                    log.error(msg_error)
9685                    raise ValueError(msg_error)
9686
9687            # Check trio pedigree
9688            if not trio_ped or len(trio_ped) != 3:
9689                msg_error = f"Error in TRIO pedigree: {trio_ped}"
9690                log.error(msg_error)
9691                raise ValueError(msg_error)
9692
9693            # Log
9694            log.info(
9695                f"Calculation 'TRIO' - Samples: "
9696                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
9697            )
9698
9699            # Field
9700            trio_infos = prefix + trio_tag
9701
9702            # Variants table
9703            table_variants = self.get_table_variants()
9704
9705            # Header
9706            vcf_reader = self.get_header()
9707
9708            # Create variant id
9709            variant_id_column = self.get_variant_id_column()
9710            added_columns = [variant_id_column]
9711
9712            # variant_id, FORMAT and samples
9713            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9714                self.get_header_sample_list()
9715            )
9716
9717            # Create dataframe
9718            dataframe_trio = self.get_query_to_df(
9719                f""" SELECT {samples_fields} FROM {table_variants} """
9720            )
9721
9722            # Create trio column
9723            dataframe_trio[trio_infos] = dataframe_trio.apply(
9724                lambda row: trio(row, samples=trio_samples), axis=1
9725            )
9726
9727            # Add trio to header
9728            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9729                trio_tag,
9730                ".",
9731                "String",
9732                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9733                "howard calculation",
9734                "0",
9735                self.code_type_map.get("String"),
9736            )
9737
9738            # Update
9739            sql_update = f"""
9740                UPDATE {table_variants}
9741                SET "INFO" = 
9742                    concat(
9743                        CASE
9744                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9745                            THEN ''
9746                            ELSE concat("INFO", ';')
9747                        END,
9748                        CASE
9749                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9750                             AND dataframe_trio."{trio_infos}" NOT NULL
9751                            THEN concat(
9752                                    '{trio_tag}=',
9753                                    dataframe_trio."{trio_infos}"
9754                                )
9755                            ELSE ''
9756                        END
9757                    )
9758                FROM dataframe_trio
9759                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9760            """
9761            self.conn.execute(sql_update)
9762
9763            # Remove added columns
9764            for added_column in added_columns:
9765                self.drop_column(column=added_column)
9766
9767            # Delete dataframe
9768            del dataframe_trio
9769            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9771    def calculation_vaf_normalization(self) -> None:
9772        """
9773        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9774        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9775        :return: The function does not return anything.
9776        """
9777
9778        # if FORMAT and samples
9779        if (
9780            "FORMAT" in self.get_header_columns_as_list()
9781            and self.get_header_sample_list()
9782        ):
9783
9784            # vaf_normalization annotation field
9785            vaf_normalization_tag = "VAF"
9786
9787            # VCF infos tags
9788            vcf_infos_tags = {
9789                "VAF": "VAF Variant Frequency",
9790            }
9791
9792            # Prefix
9793            prefix = self.get_explode_infos_prefix()
9794
9795            # Variants table
9796            table_variants = self.get_table_variants()
9797
9798            # Header
9799            vcf_reader = self.get_header()
9800
9801            # Do not calculate if VAF already exists
9802            if "VAF" in vcf_reader.formats:
9803                log.debug("VAF already on genotypes")
9804                return
9805
9806            # Create variant id
9807            variant_id_column = self.get_variant_id_column()
9808            added_columns = [variant_id_column]
9809
9810            # variant_id, FORMAT and samples
9811            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9812                f""" "{sample}" """ for sample in self.get_header_sample_list()
9813            )
9814
9815            # Create dataframe
9816            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9817            log.debug(f"query={query}")
9818            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9819
9820            vaf_normalization_set = []
9821
9822            # for each sample vaf_normalization
9823            for sample in self.get_header_sample_list():
9824                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9825                    lambda row: vaf_normalization(row, sample=sample), axis=1
9826                )
9827                vaf_normalization_set.append(
9828                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9829                )
9830
9831            # Add VAF to FORMAT
9832            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9833                "FORMAT"
9834            ].apply(lambda x: str(x) + ":VAF")
9835            vaf_normalization_set.append(
9836                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9837            )
9838
9839            # Add vaf_normalization to header
9840            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9841                id=vaf_normalization_tag,
9842                num="1",
9843                type="Float",
9844                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9845                type_code=self.code_type_map.get("Float"),
9846            )
9847
9848            # Create fields to add in INFO
9849            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9850
9851            # Update
9852            sql_update = f"""
9853                UPDATE {table_variants}
9854                SET {sql_vaf_normalization_set}
9855                FROM dataframe_vaf_normalization
9856                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9857
9858            """
9859            self.conn.execute(sql_update)
9860
9861            # Remove added columns
9862            for added_column in added_columns:
9863                self.drop_column(column=added_column)
9864
9865            # Delete dataframe
9866            del dataframe_vaf_normalization
9867            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        For each variant, the per-sample values of the `info` FORMAT field are summarized into
        INFO tags: `<info>_stats_nb`, `<info>_stats_list`, `<info>_stats_min`, `<info>_stats_max`,
        `<info>_stats_mean`, `<info>_stats_mediane` and `<info>_stats_stdev`. The tags are also
        declared in the VCF header.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when genotype data is present: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base annotation tag, e.g. "VAF_stats"
            vaf_stats_tag = info + "_stats"

            # Descriptions of the generated INFO tags
            # NOTE(review): the "mediane" spelling is part of the public tag name —
            # kept as-is for compatibility with downstream consumers
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column that will hold the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header object (tags are registered on it below)
            vcf_reader = self.get_header()

            # Create variant id column, used to join dataframe results back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns required for the computation: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Load the genotype columns into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the statistics dict for each variant row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistics tag
            sql_vaf_stats_fields = []

            # Extract each statistic into its own column and declare it in the header
            for stat in vcf_infos_tags:

                # Extract this statistic's value ("" when absent)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistics tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separate tags with ';' after the first one
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # SQL fragment appending '<tag>=<value>' when a value is present
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the statistics tags to INFO, joining on the variant id column
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate the VCF info tags for the statistics — the number of values, the list of values, the minimum, the maximum, the mean, the median, and the standard deviation. It defaults to "VAF".
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
10007    def calculation_transcripts_annotation(
10008        self, info_json: str = None, info_format: str = None
10009    ) -> None:
10010        """
10011        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10012        field to it if transcripts are available.
10013
10014        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10015        is a string parameter that represents the information field to be used in the transcripts JSON.
10016        It is used to specify the JSON format for the transcripts information. If no value is provided
10017        when calling the method, it defaults to "
10018        :type info_json: str
10019        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10020        method is a string parameter that specifies the format of the information field to be used in
10021        the transcripts JSON. It is used to define the format of the information field
10022        :type info_format: str
10023        """
10024
10025        # Create transcripts table
10026        transcripts_table = self.create_transcript_view()
10027
10028        # Add info field
10029        if transcripts_table:
10030            self.transcript_view_to_variants(
10031                transcripts_table=transcripts_table,
10032                transcripts_info_field_json=info_json,
10033                transcripts_info_field_format=info_format,
10034            )
10035        else:
10036            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
10038    def calculation_transcripts_prioritization(self) -> None:
10039        """
10040        The function `calculation_transcripts_prioritization` creates a transcripts table and
10041        prioritizes transcripts based on certain criteria.
10042        """
10043
10044        # Create transcripts table
10045        transcripts_table = self.create_transcript_view()
10046
10047        # Add info field
10048        if transcripts_table:
10049            self.transcripts_prioritization(transcripts_table=transcripts_table)
10050        else:
10051            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def calculation_transcripts_export(self) -> None:
10053    def calculation_transcripts_export(self) -> None:
10054        """ """
10055
10056        # Create transcripts table
10057        transcripts_table = self.create_transcript_view()
10058
10059        # Add info field
10060        if transcripts_table:
10061            self.transcripts_export(transcripts_table=transcripts_table)
10062        else:
10063            log.info("No Transcripts to process. Check param.json file configuration")
def transcripts_export(self, transcripts_table: str = None, param: dict = {}) -> bool:
10069    def transcripts_export(
10070        self, transcripts_table: str = None, param: dict = {}
10071    ) -> bool:
10072        """ """
10073
10074        log.debug("Start transcripts export...")
10075
10076        # Param
10077        if not param:
10078            param = self.get_param()
10079
10080        # Param export
10081        param_transcript_export = param.get("transcripts", {}).get("export", {})
10082
10083        # Output file
10084        transcripts_export_output = param_transcript_export.get("output", None)
10085
10086        if not param_transcript_export or not transcripts_export_output:
10087            log.warning(f"No transcriipts export parameters defined!")
10088            return False
10089
10090        # List of transcripts annotations
10091        query_describe = f"""
10092            SELECT column_name
10093            FROM (
10094                    DESCRIBE SELECT * FROM {transcripts_table}
10095                )
10096            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10097        """
10098        transcripts_annotations_list = list(
10099            self.get_query_to_df(query=query_describe)["column_name"]
10100        )
10101
10102        # Create transcripts table for export
10103        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10104            random.choices(string.ascii_uppercase + string.digits, k=10)
10105        )
10106        query_create_transcripts_table_export = f"""
10107            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10108        """
10109        self.execute_query(query=query_create_transcripts_table_export)
10110
10111        # Output file format
10112        transcripts_export_output_format = get_file_format(
10113            filename=transcripts_export_output
10114        )
10115
10116        # Format VCF - construct INFO
10117        if transcripts_export_output_format in ["vcf"]:
10118
10119            # Construct query update INFO and header
10120            query_update_info = []
10121            for field in transcripts_annotations_list:
10122
10123                # If field not in header
10124                if field not in self.get_header_infos_list():
10125
10126                    # Add PZ Transcript in header
10127                    self.get_header().infos[field] = vcf.parser._Info(
10128                        field,
10129                        ".",
10130                        "String",
10131                        f"Annotation '{field}' from transcript view",
10132                        "unknown",
10133                        "unknown",
10134                        0,
10135                    )
10136
10137                # Add field as INFO/tag
10138                query_update_info.append(
10139                    f"""
10140                        CASE
10141                            WHEN "{field}" IS NOT NULL
10142                            THEN concat('{field}=', "{field}", ';')    
10143                            ELSE ''     
10144                        END
10145                        """
10146                )
10147
10148            # Query param
10149            query_update_info_value = (
10150                f""" concat('',  {", ".join(query_update_info)}) """
10151            )
10152            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10153
10154        else:
10155
10156            # Query param
10157            query_update_info_value = f""" NULL """
10158            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10159
10160        # Update query INFO column
10161        query_update = f"""
10162            UPDATE {transcripts_table_export}
10163            SET INFO = {query_update_info_value}
10164
10165        """
10166        self.execute_query(query=query_update)
10167
10168        # Export
10169        self.export_output(
10170            output_file=transcripts_export_output,
10171            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10172        )
10173
10174        # Drop transcripts export table
10175        query_drop_transcripts_table_export = f"""
10176            DROP TABLE {transcripts_table_export}
10177        """
10178        self.execute_query(query=query_drop_transcripts_table_export)
def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
10180    def transcripts_prioritization(
10181        self, transcripts_table: str = None, param: dict = {}
10182    ) -> bool:
10183        """
10184        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10185        and updates the variants table with the prioritized information.
10186
10187        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10188        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10189        This parameter is used to identify the table where the transcripts data is stored for the
10190        prioritization process
10191        :type transcripts_table: str
10192        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10193        that contains various configuration settings for the prioritization process of transcripts. It
10194        is used to customize the behavior of the prioritization algorithm and includes settings such as
10195        the prefix for prioritization fields, default profiles, and other
10196        :type param: dict
10197        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10198        transcripts prioritization process is successfully completed, and `False` if there are any
10199        issues or if no profile is defined for transcripts prioritization.
10200        """
10201
10202        log.debug("Start transcripts prioritization...")
10203
10204        # Param
10205        if not param:
10206            param = self.get_param()
10207
10208        # Variants table
10209        table_variants = self.get_table_variants()
10210
10211        # Transcripts table
10212        if transcripts_table is None:
10213            transcripts_table = self.create_transcript_view(
10214                transcripts_table="transcripts", param=param
10215            )
10216        if transcripts_table is None:
10217            msg_err = "No Transcripts table availalble"
10218            log.error(msg_err)
10219            raise ValueError(msg_err)
10220        log.debug(f"transcripts_table={transcripts_table}")
10221
10222        # Get transcripts columns
10223        columns_as_list_query = f"""
10224            DESCRIBE {transcripts_table}
10225        """
10226        columns_as_list = list(
10227            self.get_query_to_df(columns_as_list_query)["column_name"]
10228        )
10229
10230        # Create INFO if not exists
10231        if "INFO" not in columns_as_list:
10232            query_add_info = f"""
10233                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10234            """
10235            self.execute_query(query_add_info)
10236
10237        # Prioritization param and Force only PZ Score and Flag
10238        pz_param = param.get("transcripts", {}).get("prioritization", {})
10239
10240        # PZ profile by default
10241        pz_profile_default = (
10242            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10243        )
10244
10245        # Exit if no profile
10246        if pz_profile_default is None:
10247            log.warning("No profile defined for transcripts prioritization")
10248            return False
10249
10250        # PZ fields
10251        pz_param_pzfields = {}
10252
10253        # PZ field transcripts
10254        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10255
10256        # Add PZ Transcript in header
10257        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10258            pz_fields_transcripts,
10259            ".",
10260            "String",
10261            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10262            "unknown",
10263            "unknown",
10264            code_type_map["String"],
10265        )
10266
10267        # Mandatory fields
10268        pz_mandatory_fields_list = [
10269            "Score",
10270            "Flag",
10271            "Tags",
10272            "Comment",
10273            "Infos",
10274            "Class",
10275        ]
10276        pz_mandatory_fields = []
10277        for pz_mandatory_field in pz_mandatory_fields_list:
10278            pz_mandatory_fields.append(
10279                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10280            )
10281
10282        # PZ fields in param
10283        for pz_field in pz_param.get("pzfields", []):
10284            if pz_field in pz_mandatory_fields_list:
10285                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10286                    pz_param.get("pzprefix", "PTZ") + pz_field
10287                )
10288            else:
10289                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10290                pz_param_pzfields[pz_field] = pz_field_new
10291
10292                # Add PZ Transcript in header
10293                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10294                    pz_field_new,
10295                    ".",
10296                    "String",
10297                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10298                    "unknown",
10299                    "unknown",
10300                    code_type_map["String"],
10301                )
10302
10303        # PZ fields param
10304        pz_param["pzfields"] = pz_mandatory_fields
10305
10306        # Prioritization
10307        prioritization_result = self.prioritization(
10308            table=transcripts_table,
10309            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10310        )
10311        if not prioritization_result:
10312            log.warning("Transcripts prioritization not processed")
10313            return False
10314
10315        # PZ fields sql query
10316        query_update_select_list = []
10317        query_update_concat_list = []
10318        query_update_order_list = []
10319        for pz_param_pzfield in set(
10320            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10321        ):
10322            query_update_select_list.append(f" {pz_param_pzfield}, ")
10323
10324        for pz_param_pzfield in pz_param_pzfields:
10325            query_update_concat_list.append(
10326                f"""
10327                    , CASE 
10328                        WHEN {pz_param_pzfield} IS NOT NULL
10329                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10330                        ELSE ''
10331                    END
10332                """
10333            )
10334
10335        # Order by
10336        pz_orders = (
10337            param.get("transcripts", {})
10338            .get("prioritization", {})
10339            .get("prioritization_transcripts_order", {})
10340        )
10341        if not pz_orders:
10342            pz_orders = {
10343                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10344                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10345            }
10346        for pz_order in pz_orders:
10347            query_update_order_list.append(
10348                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10349            )
10350
10351        # Fields to explode
10352        fields_to_explode = (
10353            list(pz_param_pzfields.keys())
10354            + pz_mandatory_fields
10355            + list(pz_orders.keys())
10356        )
10357        # Remove transcript column as a specific transcript column
10358        if "transcript" in fields_to_explode:
10359            fields_to_explode.remove("transcript")
10360
10361        # Fields intranscripts table
10362        query_transcripts_table = f"""
10363            DESCRIBE SELECT * FROM {transcripts_table}
10364        """
10365        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10366
10367        # Check fields to explode
10368        for field_to_explode in fields_to_explode:
10369            if field_to_explode not in self.get_header_infos_list() + list(
10370                query_transcripts_table.column_name
10371            ):
10372                msg_err = f"INFO/{field_to_explode} NOT IN header"
10373                log.error(msg_err)
10374                raise ValueError(msg_err)
10375
10376        # Explode fields to explode
10377        self.explode_infos(
10378            table=transcripts_table,
10379            fields=fields_to_explode,
10380        )
10381
10382        # Transcript preference file
10383        transcripts_preference_file = (
10384            param.get("transcripts", {})
10385            .get("prioritization", {})
10386            .get("prioritization_transcripts", {})
10387        )
10388        transcripts_preference_file = full_path(transcripts_preference_file)
10389
10390        # Transcript preference forced
10391        transcript_preference_force = (
10392            param.get("transcripts", {})
10393            .get("prioritization", {})
10394            .get("prioritization_transcripts_force", False)
10395        )
10396        # Transcript version forced
10397        transcript_version_force = (
10398            param.get("transcripts", {})
10399            .get("prioritization", {})
10400            .get("prioritization_transcripts_version_force", False)
10401        )
10402
10403        # Transcripts Ranking
10404        if transcripts_preference_file:
10405
10406            # Transcripts file to dataframe
10407            if os.path.exists(transcripts_preference_file):
10408                transcripts_preference_dataframe = transcripts_file_to_df(
10409                    transcripts_preference_file
10410                )
10411            else:
10412                log.error(
10413                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10414                )
10415                raise ValueError(
10416                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10417                )
10418
10419            # Order by depending to transcript preference forcing
10420            if transcript_preference_force:
10421                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10422            else:
10423                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10424
10425            # Transcript columns joined depend on version consideration
10426            if transcript_version_force:
10427                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10428            else:
10429                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10430
10431            # Query ranking for update
10432            query_update_ranking = f"""
10433                SELECT
10434                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10435                    ROW_NUMBER() OVER (
10436                        PARTITION BY "#CHROM", POS, REF, ALT
10437                        ORDER BY {order_by}
10438                    ) AS rn
10439                FROM {transcripts_table}
10440                LEFT JOIN 
10441                    (
10442                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10443                        FROM transcripts_preference_dataframe
10444                    ) AS transcripts_preference
10445                ON {transcripts_version_join}
10446            """
10447
10448        else:
10449
10450            # Query ranking for update
10451            query_update_ranking = f"""
10452                SELECT
10453                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10454                    ROW_NUMBER() OVER (
10455                        PARTITION BY "#CHROM", POS, REF, ALT
10456                        ORDER BY {" , ".join(query_update_order_list)}
10457                    ) AS rn
10458                FROM {transcripts_table}
10459            """
10460
10461        # Export Transcripts prioritization infos to variants table
10462        query_update = f"""
10463            WITH RankedTranscripts AS (
10464                {query_update_ranking}
10465            )
10466            UPDATE {table_variants}
10467                SET
10468                INFO = CONCAT(CASE
10469                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10470                            THEN ''
10471                            ELSE concat("INFO", ';')
10472                        END,
10473                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10474                        )
10475            FROM
10476                RankedTranscripts
10477            WHERE
10478                rn = 1
10479                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10480                AND variants."POS" = RankedTranscripts."POS"
10481                AND variants."REF" = RankedTranscripts."REF"
10482                AND variants."ALT" = RankedTranscripts."ALT"     
10483        """
10484
10485        # log.debug(f"query_update={query_update}")
10486        self.execute_query(query=query_update)
10487
10488        # Return
10489        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10491    def create_transcript_view_from_columns_map(
10492        self,
10493        transcripts_table: str = "transcripts",
10494        columns_maps: dict = {},
10495        added_columns: list = [],
10496        temporary_tables: list = None,
10497        annotation_fields: list = None,
10498        column_rename: dict = {},
10499        column_clean: bool = False,
10500        column_case: str = None,
10501    ) -> tuple[list, list, list]:
10502        """
10503        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10504        specified columns mapping for transcripts data.
10505
10506        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10507        of the table where the transcripts data is stored or will be stored in the database. This table
10508        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10509        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10510        :type transcripts_table: str (optional)
10511        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10512        about how to map columns from a transcripts table to create a view. Each entry in the
10513        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10514        typically includes details such as the main transcript column and additional information columns
10515        :type columns_maps: dict
10516        :param added_columns: The `added_columns` parameter in the
10517        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10518        that will be added to the view being created based on the columns map provided. These columns
10519        are generated by exploding the transcript information columns along with the main transcript
10520        column
10521        :type added_columns: list
10522        :param temporary_tables: The `temporary_tables` parameter in the
10523        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10524        tables created during the process of creating a transcript view from a columns map. These
10525        temporary tables are used to store intermediate results or transformations before the final view
10526        is generated
10527        :type temporary_tables: list
10528        :param annotation_fields: The `annotation_fields` parameter in the
10529        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10530        used for annotation in the query view creation process. These fields are extracted from the
10531        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10532        :type annotation_fields: list
10533        :param column_rename: The `column_rename` parameter in the
10534        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10535        custom renaming for columns during the creation of the temporary table view. This parameter
10536        provides a mapping of original column names to the desired renamed column names. By using this
10537        parameter,
10538        :type column_rename: dict
10539        :param column_clean: The `column_clean` parameter in the
10540        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10541        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10542        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10543        False
10544        :type column_clean: bool (optional)
10545        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10546        function is used to specify the case transformation to be applied to the columns during the view
10547        creation process. It allows you to control whether the column values should be converted to
10548        lowercase, uppercase, or remain unchanged
10549        :type column_case: str
10550        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10551        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10552        """
10553
10554        log.debug("Start transcrpts view creation from columns map...")
10555
10556        # "from_columns_map": [
10557        #     {
10558        #         "transcripts_column": "Ensembl_transcriptid",
10559        #         "transcripts_infos_columns": [
10560        #             "genename",
10561        #             "Ensembl_geneid",
10562        #             "LIST_S2_score",
10563        #             "LIST_S2_pred",
10564        #         ],
10565        #     },
10566        #     {
10567        #         "transcripts_column": "Ensembl_transcriptid",
10568        #         "transcripts_infos_columns": [
10569        #             "genename",
10570        #             "VARITY_R_score",
10571        #             "Aloft_pred",
10572        #         ],
10573        #     },
10574        # ],
10575
10576        # Init
10577        if temporary_tables is None:
10578            temporary_tables = []
10579        if annotation_fields is None:
10580            annotation_fields = []
10581
10582        # Variants table
10583        table_variants = self.get_table_variants()
10584
10585        for columns_map in columns_maps:
10586
10587            # Transcript column
10588            transcripts_column = columns_map.get("transcripts_column", None)
10589
10590            # Transcripts infos columns
10591            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10592
10593            # Transcripts infos columns rename
10594            column_rename = columns_map.get("column_rename", column_rename)
10595
10596            # Transcripts infos columns clean
10597            column_clean = columns_map.get("column_clean", column_clean)
10598
10599            # Transcripts infos columns case
10600            column_case = columns_map.get("column_case", column_case)
10601
10602            if transcripts_column is not None:
10603
10604                # Explode
10605                added_columns += self.explode_infos(
10606                    fields=[transcripts_column] + transcripts_infos_columns
10607                )
10608
10609                # View clauses
10610                clause_select_variants = []
10611                clause_select_tanscripts = []
10612                for field in [transcripts_column] + transcripts_infos_columns:
10613
10614                    # AS field
10615                    as_field = field
10616
10617                    # Rename
10618                    if column_rename:
10619                        as_field = column_rename.get(as_field, as_field)
10620
10621                    # Clean
10622                    if column_clean:
10623                        as_field = clean_annotation_field(as_field)
10624
10625                    # Case
10626                    if column_case:
10627                        if column_case.lower() in ["lower"]:
10628                            as_field = as_field.lower()
10629                        elif column_case.lower() in ["upper"]:
10630                            as_field = as_field.upper()
10631
10632                    # Clause select Variants
10633                    clause_select_variants.append(
10634                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10635                    )
10636
10637                    if field in [transcripts_column]:
10638                        clause_select_tanscripts.append(
10639                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10640                        )
10641                    else:
10642                        clause_select_tanscripts.append(
10643                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10644                        )
10645                        annotation_fields.append(as_field)
10646
10647                # Querey View
10648                query = f""" 
10649                    SELECT
10650                        "#CHROM", POS, REF, ALT, INFO,
10651                        "{transcripts_column}" AS 'transcript',
10652                        {", ".join(clause_select_tanscripts)}
10653                    FROM (
10654                        SELECT 
10655                            "#CHROM", POS, REF, ALT, INFO,
10656                            {", ".join(clause_select_variants)}
10657                        FROM {table_variants}
10658                        )
10659                    WHERE "{transcripts_column}" IS NOT NULL
10660                """
10661
10662                # Create temporary table
10663                temporary_table = transcripts_table + "".join(
10664                    random.choices(string.ascii_uppercase + string.digits, k=10)
10665                )
10666
10667                # Temporary_tables
10668                temporary_tables.append(temporary_table)
10669                query_view = f"""
10670                    CREATE TEMPORARY TABLE {temporary_table}
10671                    AS ({query})
10672                """
10673                self.execute_query(query=query_view)
10674
10675        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts"
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns map
  • column_rename: The column_rename parameter in the create_transcript_view_from_columns_map function is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names. By using this parameter,
  • column_clean: The column_clean parameter in the create_transcript_view_from_columns_map function is a boolean flag that determines whether the column values should be cleaned or not. If set to True, the column values will be cleaned by removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to False
  • column_case: The column_case parameter in the create_transcript_view_from_columns_map function is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns

The create_transcript_view_from_columns_map function returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10677    def create_transcript_view_from_column_format(
10678        self,
10679        transcripts_table: str = "transcripts",
10680        column_formats: dict = {},
10681        temporary_tables: list = None,
10682        annotation_fields: list = None,
10683        column_rename: dict = {},
10684        column_clean: bool = False,
10685        column_case: str = None,
10686    ) -> tuple[list, list, list]:
10687        """
10688        The `create_transcript_view_from_column_format` function generates a transcript view based on
10689        specified column formats, adds additional columns and annotation fields, and returns the list of
10690        temporary tables and annotation fields.
10691
10692        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10693        of the table containing the transcripts data. This table will be used as the base table for
10694        creating the transcript view. The default value for this parameter is "transcripts", but you can
10695        provide a different table name if needed, defaults to transcripts
10696        :type transcripts_table: str (optional)
10697        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10698        about the columns to be used for creating the transcript view. Each entry in the dictionary
10699        specifies the mapping between a transcripts column and a transcripts infos column. This
10700        parameter allows you to define how the columns from the transcripts table should be transformed
10701        or mapped
10702        :type column_formats: dict
10703        :param temporary_tables: The `temporary_tables` parameter in the
10704        `create_transcript_view_from_column_format` function is a list that stores the names of
10705        temporary views created during the process of creating a transcript view from a column format.
10706        These temporary views are used to manipulate and extract data before generating the final
10707        transcript view
10708        :type temporary_tables: list
10709        :param annotation_fields: The `annotation_fields` parameter in the
10710        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10711        that are extracted from the temporary views created during the process. These annotation fields
10712        are obtained by querying the temporary views and extracting the column names excluding specific
10713        columns like `#CH
10714        :type annotation_fields: list
10715        :param column_rename: The `column_rename` parameter in the
10716        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10717        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10718        column names to new column names in this dictionary, you can rename specific columns during the
10719        process
10720        :type column_rename: dict
10721        :param column_clean: The `column_clean` parameter in the
10722        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10723        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10724        will be cleaned during the creation of the transcript view based on the specified column format,
10725        defaults to False
10726        :type column_clean: bool (optional)
10727        :param column_case: The `column_case` parameter in the
10728        `create_transcript_view_from_column_format` function is used to specify the case transformation
10729        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10730        to convert the column names to uppercase or lowercase, respectively
10731        :type column_case: str
10732        :return: The `create_transcript_view_from_column_format` function returns two lists:
10733        `temporary_tables` and `annotation_fields`.
10734        """
10735
10736        log.debug("Start transcrpts view creation from column format...")
10737
10738        #  "from_column_format": [
10739        #     {
10740        #         "transcripts_column": "ANN",
10741        #         "transcripts_infos_column": "Feature_ID",
10742        #     }
10743        # ],
10744
10745        # Init
10746        if temporary_tables is None:
10747            temporary_tables = []
10748        if annotation_fields is None:
10749            annotation_fields = []
10750
10751        for column_format in column_formats:
10752
10753            # annotation field and transcript annotation field
10754            annotation_field = column_format.get("transcripts_column", "ANN")
10755            transcript_annotation = column_format.get(
10756                "transcripts_infos_column", "Feature_ID"
10757            )
10758
10759            # Transcripts infos columns rename
10760            column_rename = column_format.get("column_rename", column_rename)
10761
10762            # Transcripts infos columns clean
10763            column_clean = column_format.get("column_clean", column_clean)
10764
10765            # Transcripts infos columns case
10766            column_case = column_format.get("column_case", column_case)
10767
10768            # Temporary View name
10769            temporary_view_name = transcripts_table + "".join(
10770                random.choices(string.ascii_uppercase + string.digits, k=10)
10771            )
10772
10773            # Create temporary view name
10774            temporary_view_name = self.annotation_format_to_table(
10775                uniquify=True,
10776                annotation_field=annotation_field,
10777                view_name=temporary_view_name,
10778                annotation_id=transcript_annotation,
10779                column_rename=column_rename,
10780                column_clean=column_clean,
10781                column_case=column_case,
10782            )
10783
10784            # Annotation fields
10785            if temporary_view_name:
10786                query_annotation_fields = f"""
10787                    SELECT *
10788                    FROM (
10789                        DESCRIBE SELECT *
10790                        FROM {temporary_view_name}
10791                        )
10792                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10793                """
10794                df_annotation_fields = self.get_query_to_df(
10795                    query=query_annotation_fields
10796                )
10797
10798                # Add temporary view and annotation fields
10799                temporary_tables.append(temporary_view_name)
10800                annotation_fields += list(set(df_annotation_fields["column_name"]))
10801
10802        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding the core variant columns #CHROM, POS, REF and ALT
  • column_rename: The column_rename parameter in the create_transcript_view_from_column_format function is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process
  • column_clean: The column_clean parameter in the create_transcript_view_from_column_format function is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set to True, the columns will be cleaned during the creation of the transcript view based on the specified column format, defaults to False
  • column_case: The column_case parameter in the create_transcript_view_from_column_format function is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = False, param: dict = {}) -> str:
10804    def create_transcript_view(
10805        self,
10806        transcripts_table: str = None,
10807        transcripts_table_drop: bool = False,
10808        param: dict = {},
10809    ) -> str:
10810        """
10811        The `create_transcript_view` function generates a transcript view by processing data from a
10812        specified table based on provided parameters and structural information.
10813
10814        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10815        is used to specify the name of the table that will store the final transcript view data. If a table
10816        name is not provided, the function will create a new table to store the transcript view data, and by
10817        default,, defaults to transcripts
10818        :type transcripts_table: str (optional)
10819        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10820        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10821        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10822        the function will drop the existing transcripts table if it exists, defaults to False
10823        :type transcripts_table_drop: bool (optional)
10824        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10825        contains information needed to create a transcript view. It includes details such as the structure
10826        of the transcripts, columns mapping, column formats, and other necessary information for generating
10827        the view. This parameter allows for flexibility and customization
10828        :type param: dict
10829        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10830        created or modified during the execution of the function.
10831        """
10832
10833        log.debug("Start transcripts view creation...")
10834
10835        # Default
10836        transcripts_table_default = "transcripts"
10837
10838        # Param
10839        if not param:
10840            param = self.get_param()
10841
10842        # Struct
10843        struct = param.get("transcripts", {}).get("struct", None)
10844
10845        # Transcript veresion
10846        transcript_id_remove_version = param.get("transcripts", {}).get(
10847            "transcript_id_remove_version", False
10848        )
10849
10850        # Transcripts mapping
10851        transcript_id_mapping_file = param.get("transcripts", {}).get(
10852            "transcript_id_mapping_file", None
10853        )
10854
10855        # Transcripts mapping
10856        transcript_id_mapping_force = param.get("transcripts", {}).get(
10857            "transcript_id_mapping_force", None
10858        )
10859
10860        if struct:
10861
10862            # Transcripts table
10863            if transcripts_table is None:
10864                transcripts_table = param.get("transcripts", {}).get(
10865                    "table", transcripts_table_default
10866                )
10867
10868            # added_columns
10869            added_columns = []
10870
10871            # Temporary tables
10872            temporary_tables = []
10873
10874            # Annotation fields
10875            annotation_fields = []
10876
10877            # from columns map
10878            columns_maps = struct.get("from_columns_map", [])
10879            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10880                self.create_transcript_view_from_columns_map(
10881                    transcripts_table=transcripts_table,
10882                    columns_maps=columns_maps,
10883                    added_columns=added_columns,
10884                    temporary_tables=temporary_tables,
10885                    annotation_fields=annotation_fields,
10886                )
10887            )
10888            added_columns += added_columns_tmp
10889            temporary_tables += temporary_tables_tmp
10890            annotation_fields += annotation_fields_tmp
10891
10892            # from column format
10893            column_formats = struct.get("from_column_format", [])
10894            temporary_tables_tmp, annotation_fields_tmp = (
10895                self.create_transcript_view_from_column_format(
10896                    transcripts_table=transcripts_table,
10897                    column_formats=column_formats,
10898                    temporary_tables=temporary_tables,
10899                    annotation_fields=annotation_fields,
10900                )
10901            )
10902            temporary_tables += temporary_tables_tmp
10903            annotation_fields += annotation_fields_tmp
10904
10905            # Remove some specific fields/column
10906            annotation_fields = list(set(annotation_fields))
10907            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10908                if field in annotation_fields:
10909                    annotation_fields.remove(field)
10910
10911            # Merge temporary tables query
10912            query_merge = ""
10913            for temporary_table in list(set(temporary_tables)):
10914
10915                # First temporary table
10916                if not query_merge:
10917                    query_merge = f"""
10918                        SELECT * FROM {temporary_table}
10919                    """
10920                # other temporary table (using UNION)
10921                else:
10922                    query_merge += f"""
10923                        UNION BY NAME SELECT * FROM {temporary_table}
10924                    """
10925
10926            # transcript table tmp
10927            transcript_table_tmp = "transcripts_tmp"
10928            transcript_table_tmp2 = "transcripts_tmp2"
10929            transcript_table_tmp3 = "transcripts_tmp3"
10930
10931            # Merge on transcript
10932            query_merge_on_transcripts_annotation_fields = []
10933
10934            # Add transcript list
10935            query_merge_on_transcripts_annotation_fields.append(
10936                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10937            )
10938
10939            # Aggregate all annotations fields
10940            for annotation_field in set(annotation_fields):
10941                query_merge_on_transcripts_annotation_fields.append(
10942                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10943                )
10944
10945            # Transcripts mapping
10946            if transcript_id_mapping_file:
10947
10948                # Transcript dataframe
10949                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10950                transcript_id_mapping_dataframe = transcripts_file_to_df(
10951                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10952                )
10953
10954                # Transcript version remove
10955                if transcript_id_remove_version:
10956                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10957                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10958                    query_left_join = f"""
10959                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10960                    """
10961                else:
10962                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10963                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10964                    query_left_join = f"""
10965                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10966                    """
10967
10968                # Transcript column for group by merge
10969                query_transcript_merge_group_by = """
10970                        CASE
10971                            WHEN transcript_mapped NOT IN ('')
10972                            THEN split_part(transcript_mapped, '.', 1)
10973                            ELSE split_part(transcript_original, '.', 1)
10974                        END
10975                    """
10976
10977                # Merge query
10978                transcripts_tmp2_query = f"""
10979                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10980                    FROM ({query_merge}) AS {transcript_table_tmp}
10981                    {query_left_join}
10982                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10983                """
10984
10985                # Retrive columns after mege
10986                transcripts_tmp2_describe_query = f"""
10987                    DESCRIBE {transcripts_tmp2_query}
10988                """
10989                transcripts_tmp2_describe_list = list(
10990                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10991                        "column_name"
10992                    ]
10993                )
10994
10995                # Create list of columns for select clause
10996                transcripts_tmp2_describe_select_clause = []
10997                for field in transcripts_tmp2_describe_list:
10998                    if field not in [
10999                        "#CHROM",
11000                        "POS",
11001                        "REF",
11002                        "ALT",
11003                        "INFO",
11004                        "transcript_mapped",
11005                    ]:
11006                        as_field = field
11007                        if field in ["transcript_original"]:
11008                            as_field = "transcripts_mapped"
11009                        transcripts_tmp2_describe_select_clause.append(
11010                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11011                        )
11012
11013                # Merge with mapping
11014                query_merge_on_transcripts = f"""
11015                    SELECT
11016                        "#CHROM", POS, REF, ALT, INFO,
11017                        CASE
11018                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11019                            THEN ANY_VALUE(transcript_mapped)
11020                            ELSE ANY_VALUE(transcript_original)
11021                        END AS transcript,
11022                        {", ".join(transcripts_tmp2_describe_select_clause)}
11023                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11024                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11025                        {query_transcript_merge_group_by}
11026                """
11027
11028                # Add transcript filter from mapping file
11029                if transcript_id_mapping_force:
11030                    query_merge_on_transcripts = f"""
11031                        SELECT *
11032                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11033                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11034                    """
11035
11036            # No transcript mapping
11037            else:
11038
11039                # Remove transcript version
11040                if transcript_id_remove_version:
11041                    query_transcript_column = f"""
11042                        split_part({transcript_table_tmp}.transcript, '.', 1)
11043                    """
11044                else:
11045                    query_transcript_column = """
11046                        transcript
11047                    """
11048
11049                # Query sections
11050                query_transcript_column_select = (
11051                    f"{query_transcript_column} AS transcript"
11052                )
11053                query_transcript_column_group_by = query_transcript_column
11054
11055                # Query for transcripts view
11056                query_merge_on_transcripts = f"""
11057                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11058                    FROM ({query_merge}) AS {transcript_table_tmp}
11059                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11060                """
11061
11062            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11063
11064            # Drop transcript view is necessary
11065            if transcripts_table_drop:
11066                query_drop = f"""
11067                    DROP TABLE IF EXISTS {transcripts_table};
11068                """
11069                self.execute_query(query=query_drop)
11070
11071            # Merge and create transcript view
11072            query_create_view = f"""
11073                CREATE TABLE IF NOT EXISTS {transcripts_table}
11074                AS {query_merge_on_transcripts}
11075            """
11076            self.execute_query(query=query_create_view)
11077
11078            # Remove added columns
11079            for added_column in added_columns:
11080                self.drop_column(column=added_column)
11081
11082        else:
11083
11084            transcripts_table = None
11085
11086        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function creates the table under the default name; defaults to transcripts
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to False
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization of the view generation
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts', column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> str:
11088    def annotation_format_to_table(
11089        self,
11090        uniquify: bool = True,
11091        annotation_field: str = "ANN",
11092        annotation_id: str = "Feature_ID",
11093        view_name: str = "transcripts",
11094        column_rename: dict = {},
11095        column_clean: bool = False,
11096        column_case: str = None,
11097    ) -> str:
11098        """
11099        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11100        structured table format, ensuring unique values and creating a temporary table for further
11101        processing or analysis.
11102
11103        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11104        unique values in the output or not. If set to `True`, the function will make sure that the
11105        output values are unique, defaults to True
11106        :type uniquify: bool (optional)
11107        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11108        that contains the annotation information for each variant. This field is used to extract the
11109        annotation details for further processing in the function. By default, it is set to "ANN",
11110        defaults to ANN
11111        :type annotation_field: str (optional)
11112        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11113        is used to specify the identifier for the annotation feature. This identifier will be used as a
11114        column name in the resulting table or view that is created based on the annotation data. It
11115        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11116        :type annotation_id: str (optional)
11117        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11118        to specify the name of the temporary table that will be created to store the transformed
11119        annotation data. This table will hold the extracted information from the annotation field in a
11120        structured format for further processing or analysis. By default,, defaults to transcripts
11121        :type view_name: str (optional)
11122        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11123        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11124        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11125        created based on the annotation data. This feature enables
11126        :type column_rename: dict
11127        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11128        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11129        If set to `True`, the function will clean the annotation field before further processing. This
11130        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11131        to False
11132        :type column_clean: bool (optional)
11133        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11134        used to specify the case transformation to be applied to the column names extracted from the
11135        annotation data. It allows you to set the case of the column names to either lowercase or
11136        uppercase for consistency or other specific requirements during the conversion
11137        :type column_case: str
11138        :return: The function `annotation_format_to_table` is returning the name of the view created,
11139        which is stored in the variable `view_name`.
11140        """
11141
11142        # Annotation field
11143        annotation_format = "annotation_explode"
11144
11145        # Transcript annotation
11146        if column_rename:
11147            annotation_id = column_rename.get(annotation_id, annotation_id)
11148
11149        if column_clean:
11150            annotation_id = clean_annotation_field(annotation_id)
11151
11152        # Prefix
11153        prefix = self.get_explode_infos_prefix()
11154        if prefix:
11155            prefix = "INFO/"
11156
11157        # Annotation fields
11158        annotation_infos = prefix + annotation_field
11159        annotation_format_infos = prefix + annotation_format
11160
11161        # Variants table
11162        table_variants = self.get_table_variants()
11163
11164        # Header
11165        vcf_reader = self.get_header()
11166
11167        # Add columns
11168        added_columns = []
11169
11170        # Explode HGVS field in column
11171        added_columns += self.explode_infos(fields=[annotation_field])
11172
11173        if annotation_field in vcf_reader.infos:
11174
11175            # Extract ANN header
11176            ann_description = vcf_reader.infos[annotation_field].desc
11177            pattern = r"'(.+?)'"
11178            match = re.search(pattern, ann_description)
11179            if match:
11180                ann_header_match = match.group(1).split(" | ")
11181                ann_header = []
11182                ann_header_desc = {}
11183                for i in range(len(ann_header_match)):
11184                    ann_header_info = "".join(
11185                        char for char in ann_header_match[i] if char.isalnum()
11186                    )
11187                    ann_header.append(ann_header_info)
11188                    ann_header_desc[ann_header_info] = ann_header_match[i]
11189                if not ann_header_desc:
11190                    raise ValueError("Invalid header description format")
11191            else:
11192                raise ValueError("Invalid header description format")
11193
11194            # Create variant id
11195            variant_id_column = self.get_variant_id_column()
11196            added_columns += [variant_id_column]
11197
11198            # Create dataframe
11199            dataframe_annotation_format = self.get_query_to_df(
11200                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
11201            )
11202
11203            # Create annotation columns
11204            dataframe_annotation_format[
11205                annotation_format_infos
11206            ] = dataframe_annotation_format[annotation_infos].apply(
11207                lambda x: explode_annotation_format(
11208                    annotation=str(x),
11209                    uniquify=uniquify,
11210                    output_format="JSON",
11211                    prefix="",
11212                    header=list(ann_header_desc.values()),
11213                )
11214            )
11215
11216            # Find keys
11217            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
11218            df_keys = self.get_query_to_df(query=query_json)
11219
11220            # Check keys
11221            query_json_key = []
11222            for _, row in df_keys.iterrows():
11223
11224                # Key
11225                key = row.iloc[0]
11226                key_clean = key
11227
11228                # key rename
11229                if column_rename:
11230                    key_clean = column_rename.get(key_clean, key_clean)
11231
11232                # key clean
11233                if column_clean:
11234                    key_clean = clean_annotation_field(key_clean)
11235
11236                # Key case
11237                if column_case:
11238                    if column_case.lower() in ["lower"]:
11239                        key_clean = key_clean.lower()
11240                    elif column_case.lower() in ["upper"]:
11241                        key_clean = key_clean.upper()
11242
11243                # Type
11244                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
11245
11246                # Get DataFrame from query
11247                df_json_type = self.get_query_to_df(query=query_json_type)
11248
11249                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
11250                with pd.option_context("future.no_silent_downcasting", True):
11251                    df_json_type.fillna(value="", inplace=True)
11252                    replace_dict = {None: np.nan, "": np.nan}
11253                    df_json_type.replace(replace_dict, inplace=True)
11254                    df_json_type.dropna(inplace=True)
11255
11256                # Detect column type
11257                column_type = detect_column_type(df_json_type[key_clean])
11258
11259                # Append
11260                query_json_key.append(
11261                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11262                )
11263
11264            # Create view
11265            query_view = f"""
11266                CREATE TEMPORARY TABLE {view_name}
11267                AS (
11268                    SELECT *, {annotation_id} AS 'transcript'
11269                    FROM (
11270                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11271                        FROM dataframe_annotation_format
11272                        )
11273                    );
11274            """
11275            self.execute_query(query=query_view)
11276
11277        else:
11278
11279            # Return None
11280            view_name = None
11281
11282        # Remove added columns
11283        for added_column in added_columns:
11284            self.drop_column(column=added_column)
11285
11286        return view_name

The annotation_format_to_table function converts annotation data from a VCF file into a structured table format, ensuring unique values and creating a temporary table for further processing or analysis.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. By default, it is set to "ANN", defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the resulting table; defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis; defaults to transcripts
  • column_rename: The column_rename parameter in the annotation_format_to_table method is a dictionary that allows you to specify custom renaming for columns. By providing key-value pairs in this dictionary, you can rename specific columns in the resulting table or view that is created based on the annotation data. This feature enables customization of column names in the output
  • column_clean: The column_clean parameter in the annotation_format_to_table method is a boolean flag that determines whether the annotation field should undergo a cleaning process. If set to True, the function will clean the annotation field before further processing. This cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults to False
  • column_case: The column_case parameter in the annotation_format_to_table method is used to specify the case transformation to be applied to the column names extracted from the annotation data. It allows you to set the case of the column names to either lowercase or uppercase for consistency or other specific requirements during the conversion
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
11288    def transcript_view_to_variants(
11289        self,
11290        transcripts_table: str = None,
11291        transcripts_column_id: str = None,
11292        transcripts_info_json: str = None,
11293        transcripts_info_field_json: str = None,
11294        transcripts_info_format: str = None,
11295        transcripts_info_field_format: str = None,
11296        param: dict = {},
11297    ) -> bool:
11298        """
11299        The `transcript_view_to_variants` function updates a variants table with information from
11300        transcripts in JSON format.
11301
11302        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11303        table containing the transcripts data. If this parameter is not provided, the function will
11304        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11305        :type transcripts_table: str
11306        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11307        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11308        identifier is used to match transcripts with variants in the database
11309        :type transcripts_column_id: str
11310        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11311        of the column in the variants table where the transcripts information will be stored in JSON
11312        format. This parameter allows you to define the column in the variants table that will hold the
11313        JSON-formatted information about transcripts
11314        :type transcripts_info_json: str
11315        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11316        specify the field in the VCF header that will contain information about transcripts in JSON
11317        format. This field will be added to the VCF header as an INFO field with the specified name
11318        :type transcripts_info_field_json: str
11319        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11320        format of the information about transcripts that will be stored in the variants table. This
11321        format can be used to define how the transcript information will be structured or displayed
11322        within the variants table
11323        :type transcripts_info_format: str
11324        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11325        specify the field in the VCF header that will contain information about transcripts in a
11326        specific format. This field will be added to the VCF header as an INFO field with the specified
11327        name
11328        :type transcripts_info_field_format: str
11329        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11330        that contains various configuration settings related to transcripts. It is used to provide
11331        default values for certain parameters if they are not explicitly provided when calling the
11332        method. The `param` dictionary can be passed as an argument
11333        :type param: dict
11334        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11335        if the operation is successful and `False` if certain conditions are not met.
11336        """
11337
11338        msg_info_prefix = "Start transcripts view to variants annotations"
11339
11340        log.debug(f"{msg_info_prefix}...")
11341
11342        # Default
11343        transcripts_table_default = "transcripts"
11344        transcripts_column_id_default = "transcript"
11345        transcripts_info_json_default = None
11346        transcripts_info_format_default = None
11347        transcripts_info_field_json_default = None
11348        transcripts_info_field_format_default = None
11349
11350        # Param
11351        if not param:
11352            param = self.get_param()
11353
11354        # Transcripts table
11355        if transcripts_table is None:
11356            transcripts_table = param.get("transcripts", {}).get(
11357                "table", transcripts_table_default
11358            )
11359
11360        # Transcripts column ID
11361        if transcripts_column_id is None:
11362            transcripts_column_id = param.get("transcripts", {}).get(
11363                "column_id", transcripts_column_id_default
11364            )
11365
11366        # Transcripts info json
11367        if transcripts_info_json is None:
11368            transcripts_info_json = param.get("transcripts", {}).get(
11369                "transcripts_info_json", transcripts_info_json_default
11370            )
11371
11372        # Transcripts info field JSON
11373        if transcripts_info_field_json is None:
11374            transcripts_info_field_json = param.get("transcripts", {}).get(
11375                "transcripts_info_field_json", transcripts_info_field_json_default
11376            )
11377        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11378        #     transcripts_info_json = transcripts_info_field_json
11379
11380        # Transcripts info format
11381        if transcripts_info_format is None:
11382            transcripts_info_format = param.get("transcripts", {}).get(
11383                "transcripts_info_format", transcripts_info_format_default
11384            )
11385
11386        # Transcripts info field FORMAT
11387        if transcripts_info_field_format is None:
11388            transcripts_info_field_format = param.get("transcripts", {}).get(
11389                "transcripts_info_field_format", transcripts_info_field_format_default
11390            )
11391        # if (
11392        #     transcripts_info_field_format is not None
11393        #     and transcripts_info_format is None
11394        # ):
11395        #     transcripts_info_format = transcripts_info_field_format
11396
11397        # Variants table
11398        table_variants = self.get_table_variants()
11399
11400        # Check info columns param
11401        if (
11402            transcripts_info_json is None
11403            and transcripts_info_field_json is None
11404            and transcripts_info_format is None
11405            and transcripts_info_field_format is None
11406        ):
11407            return False
11408
11409        # Transcripts infos columns
11410        query_transcripts_infos_columns = f"""
11411            SELECT *
11412            FROM (
11413                DESCRIBE SELECT * FROM {transcripts_table}
11414                )
11415            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11416        """
11417        transcripts_infos_columns = list(
11418            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11419        )
11420
11421        # View results
11422        clause_select = []
11423        clause_to_json = []
11424        clause_to_format = []
11425        for field in transcripts_infos_columns:
11426            # Do not consider INFO field for export into fields
11427            if field not in ["INFO"]:
11428                clause_select.append(
11429                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11430                )
11431                clause_to_json.append(f""" '{field}': "{field}" """)
11432                clause_to_format.append(f""" "{field}" """)
11433
11434        # Update
11435        update_set_json = []
11436        update_set_format = []
11437
11438        # VCF header
11439        vcf_reader = self.get_header()
11440
11441        # Transcripts to info column in JSON
11442        if transcripts_info_json:
11443
11444            # Create column on variants table
11445            self.add_column(
11446                table_name=table_variants,
11447                column_name=transcripts_info_json,
11448                column_type="JSON",
11449                default_value=None,
11450                drop=False,
11451            )
11452
11453            # Add header
11454            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11455                transcripts_info_json,
11456                ".",
11457                "String",
11458                "Transcripts in JSON format",
11459                "unknwon",
11460                "unknwon",
11461                self.code_type_map["String"],
11462            )
11463
11464            # Add to update
11465            update_set_json.append(
11466                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11467            )
11468
11469        # Transcripts to info field in JSON
11470        if transcripts_info_field_json:
11471
11472            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11473
11474            # Add to update
11475            update_set_json.append(
11476                f""" 
11477                    INFO = concat(
11478                            CASE
11479                                WHEN INFO NOT IN ('', '.')
11480                                THEN INFO
11481                                ELSE ''
11482                            END,
11483                            CASE
11484                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11485                                THEN concat(
11486                                    ';{transcripts_info_field_json}=',
11487                                    t.{transcripts_info_json}
11488                                )
11489                                ELSE ''
11490                            END
11491                            )
11492                """
11493            )
11494
11495            # Add header
11496            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11497                transcripts_info_field_json,
11498                ".",
11499                "String",
11500                "Transcripts in JSON format",
11501                "unknwon",
11502                "unknwon",
11503                self.code_type_map["String"],
11504            )
11505
11506        if update_set_json:
11507
11508            # Update query
11509            query_update = f"""
11510                UPDATE {table_variants}
11511                    SET {", ".join(update_set_json)}
11512                FROM
11513                (
11514                    SELECT
11515                        "#CHROM", POS, REF, ALT,
11516                            concat(
11517                            '{{',
11518                            string_agg(
11519                                '"' || "{transcripts_column_id}" || '":' ||
11520                                to_json(json_output)
11521                            ),
11522                            '}}'
11523                            )::JSON AS {transcripts_info_json}
11524                    FROM
11525                        (
11526                        SELECT
11527                            "#CHROM", POS, REF, ALT,
11528                            "{transcripts_column_id}",
11529                            to_json(
11530                                {{{",".join(clause_to_json)}}}
11531                            )::JSON AS json_output
11532                        FROM
11533                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11534                        WHERE "{transcripts_column_id}" IS NOT NULL
11535                        )
11536                    GROUP BY "#CHROM", POS, REF, ALT
11537                ) AS t
11538                WHERE {table_variants}."#CHROM" = t."#CHROM"
11539                    AND {table_variants}."POS" = t."POS"
11540                    AND {table_variants}."REF" = t."REF"
11541                    AND {table_variants}."ALT" = t."ALT"
11542            """
11543
11544            self.execute_query(query=query_update)
11545
11546        # Transcripts to info column in FORMAT
11547        if transcripts_info_format:
11548
11549            # Create column on variants table
11550            self.add_column(
11551                table_name=table_variants,
11552                column_name=transcripts_info_format,
11553                column_type="VARCHAR",
11554                default_value=None,
11555                drop=False,
11556            )
11557
11558            # Add header
11559            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11560                transcripts_info_format,
11561                ".",
11562                "String",
11563                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11564                "unknwon",
11565                "unknwon",
11566                self.code_type_map["String"],
11567            )
11568
11569            # Add to update
11570            update_set_format.append(
11571                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11572            )
11573
11574        else:
11575
11576            # Set variable for internal queries
11577            transcripts_info_format = "transcripts_info_format"
11578
11579        # Transcripts to info field in JSON
11580        if transcripts_info_field_format:
11581
11582            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11583
11584            # Add to update
11585            update_set_format.append(
11586                f""" 
11587                    INFO = concat(
11588                            CASE
11589                                WHEN INFO NOT IN ('', '.')
11590                                THEN INFO
11591                                ELSE ''
11592                            END,
11593                            CASE
11594                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11595                                THEN concat(
11596                                    ';{transcripts_info_field_format}=',
11597                                    t.{transcripts_info_format}
11598                                )
11599                                ELSE ''
11600                            END
11601                            )
11602                """
11603            )
11604
11605            # Add header
11606            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11607                transcripts_info_field_format,
11608                ".",
11609                "String",
11610                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11611                "unknwon",
11612                "unknwon",
11613                self.code_type_map["String"],
11614            )
11615
11616        if update_set_format:
11617
11618            # Update query
11619            query_update = f"""
11620                UPDATE {table_variants}
11621                    SET {", ".join(update_set_format)}
11622                FROM
11623                (
11624                    SELECT
11625                        "#CHROM", POS, REF, ALT,
11626                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11627                    FROM 
11628                        (
11629                        SELECT
11630                            "#CHROM", POS, REF, ALT,
11631                            "{transcripts_column_id}",
11632                            concat(
11633                                "{transcripts_column_id}",
11634                                '|',
11635                                {", '|', ".join(clause_to_format)}
11636                            ) AS {transcripts_info_format}
11637                        FROM
11638                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11639                        )
11640                    GROUP BY "#CHROM", POS, REF, ALT
11641                ) AS t
11642                WHERE {table_variants}."#CHROM" = t."#CHROM"
11643                    AND {table_variants}."POS" = t."POS"
11644                    AND {table_variants}."REF" = t."REF"
11645                    AND {table_variants}."ALT" = t."ALT"
11646            """
11647
11648            self.execute_query(query=query_update)
11649
11650        return True

The transcript_view_to_variants function updates a variants table with information from transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The param dictionary can be passed as an argument when calling the method.
Returns

The function transcript_view_to_variants returns a boolean value. It returns True if the operation is successful and False if certain conditions are not met.

def rename_info_fields(self, fields_to_rename: dict = None, table: str = None) -> dict:
11652    def rename_info_fields(
11653        self, fields_to_rename: dict = None, table: str = None
11654    ) -> dict:
11655        """
11656        The `rename_info_fields` function renames specified fields in a VCF file header and updates
11657        corresponding INFO fields in the variants table.
11658
11659        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
11660        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
11661        represent the original field names that need to be renamed, and the corresponding values
11662        represent the new names to which the fields should be
11663        :type fields_to_rename: dict
11664        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
11665        the table in which the variants data is stored. This table contains information about genetic
11666        variants, and the function updates the corresponding INFO fields in this table when renaming
11667        specified fields in the VCF file header
11668        :type table: str
11669        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
11670        the original field names as keys and their corresponding new names (or None if the field was
11671        removed) as values after renaming or removing specified fields in a VCF file header and updating
11672        corresponding INFO fields in the variants table.
11673        """
11674
11675        # Init
11676        fields_renamed = {}
11677        config = self.get_config()
11678        access = config.get("access")
11679
11680        if table is None:
11681            table = self.get_table_variants()
11682
11683        # regexp replace fonction
11684        regex_replace_dict = {}
11685        regex_replace_nb = 0
11686        regex_replace_partition = 125
11687        regex_replace = "INFO"
11688
11689        if fields_to_rename is not None and access not in ["RO"]:
11690
11691            log.info("Rename or remove fields...")
11692
11693            # Header
11694            header = self.get_header()
11695
11696            for field_to_rename, field_renamed in fields_to_rename.items():
11697
11698                if field_to_rename in header.infos:
11699
11700                    # Rename header
11701                    if field_renamed is not None:
11702                        header.infos[field_renamed] = vcf.parser._Info(
11703                            field_renamed,
11704                            header.infos[field_to_rename].num,
11705                            header.infos[field_to_rename].type,
11706                            header.infos[field_to_rename].desc,
11707                            header.infos[field_to_rename].source,
11708                            header.infos[field_to_rename].version,
11709                            header.infos[field_to_rename].type_code,
11710                        )
11711                    del header.infos[field_to_rename]
11712
11713                    # Rename INFO patterns
11714                    field_pattern = rf'(^|;)({field_to_rename})=([^;]*)'
11715                    if field_renamed is not None:
11716                        field_renamed_pattern = rf'\1{field_renamed}=\3'
11717                    else:
11718                        field_renamed_pattern = ''
11719
11720                    # regexp replace
11721                    regex_replace_nb += 1
11722                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
11723                    if (regex_replace_nb % regex_replace_partition) == 0:
11724                        regex_replace = "INFO"
11725                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
11726                    regex_replace_dict[regex_replace_key] = regex_replace
11727
11728                    # Return
11729                    fields_renamed[field_to_rename] = field_renamed
11730
11731                    # Log
11732                    if field_renamed is not None:
11733                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
11734                    else:
11735                        log.info(f"Rename or remove fields - field '{field_to_rename}' removed")
11736
11737            # Rename INFO
11738            for regex_replace_key, regex_replace  in regex_replace_dict.items():
11739                log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
11740                query = f"""
11741                    UPDATE {table}
11742                    SET
11743                        INFO = {regex_replace}
11744                """
11745                log.debug(f"query={query}")
11746                self.execute_query(query=query)
11747
11748        return fields_renamed

The rename_info_fields function renames specified fields in a VCF file header and updates corresponding INFO fields in the variants table.

Parameters
  • fields_to_rename: The fields_to_rename parameter is a dictionary that contains the mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary represent the original field names that need to be renamed, and the corresponding values represent the new names to which the fields should be renamed.
  • table: The table parameter in the rename_info_fields function represents the name of the table in which the variants data is stored. This table contains information about genetic variants, and the function updates the corresponding INFO fields in this table when renaming specified fields in the VCF file header
Returns

The rename_info_fields function returns a dictionary fields_renamed that contains the original field names as keys and their corresponding new names (or None if the field was removed) as values after renaming or removing specified fields in a VCF file header and updating corresponding INFO fields in the variants table.

def calculation_rename_info_fields( self, fields_to_rename: dict = None, table: str = None, operation_name: str = 'RENAME_INFO_FIELDS') -> None:
11750    def calculation_rename_info_fields(
11751        self,
11752        fields_to_rename: dict = None,
11753        table: str = None,
11754        operation_name: str = "RENAME_INFO_FIELDS",
11755    ) -> None:
11756        """
11757        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11758        fields to rename and table if provided, and then calls another function to rename the fields.
11759
11760        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11761        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11762        the key and the new field name as the value
11763        :type fields_to_rename: dict
11764        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11765        specify the name of the table for which the fields are to be renamed. It is a string type
11766        parameter
11767        :type table: str
11768        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11769        method is a string that specifies the name of the operation being performed. In this context, it
11770        is used as a default value for the operation name if not explicitly provided when calling the
11771        function, defaults to RENAME_INFO_FIELDS
11772        :type operation_name: str (optional)
11773        """
11774
11775        # Param
11776        param = self.get_param()
11777
11778        # Get param fields to rename
11779        param_fields_to_rename = (
11780            param.get("calculation", {})
11781            .get("calculations", {})
11782            .get(operation_name, {})
11783            .get("fields_to_rename", None)
11784        )
11785
11786        # Get param table
11787        param_table = (
11788            param.get("calculation", {})
11789            .get("calculations", {})
11790            .get(operation_name, {})
11791            .get("table", None)
11792        )
11793
11794        # Init fields_to_rename
11795        if fields_to_rename is None:
11796            fields_to_rename = param_fields_to_rename
11797
11798        # Init table
11799        if table is None:
11800            table = param_table
11801
11802        renamed_fields = self.rename_info_fields(
11803            fields_to_rename=fields_to_rename, table=table
11804        )
11805
11806        log.debug(f"renamed_fields:{renamed_fields}")

The calculation_rename_info_fields function retrieves parameters from a dictionary, updates fields to rename and table if provided, and then calls another function to rename the fields.

Parameters
  • fields_to_rename: fields_to_rename is a dictionary that contains the fields to be renamed in a table. Each key-value pair in the dictionary represents the original field name as the key and the new field name as the value
  • table: The table parameter in the calculation_rename_info_fields method is used to specify the name of the table for which the fields are to be renamed. It is a string type parameter
  • operation_name: The operation_name parameter in the calculation_rename_info_fields method is a string that specifies the name of the operation being performed. In this context, it is used as a default value for the operation name if not explicitly provided when calling the function, defaults to RENAME_INFO_FIELDS